Diffstat (limited to 'third_party/eigen3/Eigen/src/Core/arch')
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/AVX/Complex.h | 463
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/AVX/MathFunctions.h | 495
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/AVX/PacketMath.h | 650
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/AVX/TypeCasting.h | 51
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h | 439
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h | 299
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h | 943
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/CUDA/MathFunctions.h | 112
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/CUDA/PacketMath.h | 342
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/Default/Settings.h | 49
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/NEON/Complex.h | 467
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h | 91
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h | 745
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h | 486
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h | 529
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h | 883
-rw-r--r--  third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h | 77
17 files changed, 0 insertions, 7121 deletions
diff --git a/third_party/eigen3/Eigen/src/Core/arch/AVX/Complex.h b/third_party/eigen3/Eigen/src/Core/arch/AVX/Complex.h
deleted file mode 100644
index e98c40e1f1..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/AVX/Complex.h
+++ /dev/null
@@ -1,463 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COMPLEX_AVX_H
-#define EIGEN_COMPLEX_AVX_H
-
-namespace Eigen {
-
-namespace internal {
-
-//---------- float ----------
-struct Packet4cf
-{
- EIGEN_STRONG_INLINE Packet4cf() {}
- EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
- __m256 v;
-};
-
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
- typedef Packet4cf type;
- typedef Packet2cf half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 4,
- HasHalfPacket = 1,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0
- };
-};
-
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
-
-template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
-{
- return Packet4cf(pnegate(a.v));
-}
-template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
-{
- const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet4cf(_mm256_xor_ps(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
-{
- __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
- __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
- __m256 result = _mm256_addsub_ps(tmp1, tmp2);
- return Packet4cf(result);
-}
-
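The moveldup/movehdup/addsub sequence above computes the standard complex product on each interleaved (re, im) pair: the even lanes receive a.re*b.re - a.im*b.im and the odd lanes a.re*b.im + a.im*b.re. A scalar sketch of the same arithmetic for a single element follows; it is illustrative only, not part of the deleted file, and the function name is hypothetical.

#include <complex>

std::complex<float> cmul_sketch(const std::complex<float>& a, const std::complex<float>& b) {
  // tmp1 = moveldup(a) * b        -> (a.re*b.re, a.re*b.im) per pair
  float t1_even = a.real() * b.real();
  float t1_odd  = a.real() * b.imag();
  // tmp2 = movehdup(a) * swap(b)  -> (a.im*b.im, a.im*b.re) per pair
  float t2_even = a.imag() * b.imag();
  float t2_odd  = a.imag() * b.real();
  // addsub: subtract in the even lanes, add in the odd lanes
  return std::complex<float>(t1_even - t2_even, t1_odd + t2_odd);
}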
-template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
-
-
-template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
-{
- return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
-{
- // FIXME The following might be optimized using _mm256_movedup_pd
- Packet2cf a = ploaddup<Packet2cf>(from);
- Packet2cf b = ploaddup<Packet2cf>(from+1);
- return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride)
-{
- return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
- std::imag(from[2*stride]), std::real(from[2*stride]),
- std::imag(from[1*stride]), std::real(from[1*stride]),
- std::imag(from[0*stride]), std::real(from[0*stride])));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride)
-{
- __m128 low = _mm256_extractf128_ps(from.v, 0);
- to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
- to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
-
- __m128 high = _mm256_extractf128_ps(from.v, 1);
- to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
- to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
-
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
-{
- return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
- __m128 low = _mm256_extractf128_ps(a.v, 0);
- __m128 high = _mm256_extractf128_ps(a.v, 1);
- __m128d lowd = _mm_castps_pd(low);
- __m128d highd = _mm_castps_pd(high);
- low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
- high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
- __m256 result = _mm256_setzero_ps();
- result = _mm256_insertf128_ps(result, low, 1);
- result = _mm256_insertf128_ps(result, high, 0);
- return Packet4cf(result);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
-{
- return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
- Packet2cf(_mm256_extractf128_ps(a.v,1))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
-{
- Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
- Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
- t0 = _mm256_hadd_ps(t0,t1);
- Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
- Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
- t2 = _mm256_hadd_ps(t2,t3);
-
- t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
- t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
-
- return Packet4cf(_mm256_add_ps(t1,t3));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
-{
- return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
- Packet2cf(_mm256_extractf128_ps(a.v, 1))));
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet4cf>
-{
- static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
- {
- if (Offset==0) return;
- palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
- }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
- {
- return internal::pmul(a, pconj(b));
- }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
- {
- return internal::pmul(pconj(a), b);
- }
-};
-
-template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
- {
- return pconj(internal::pmul(a, b));
- }
-};
-
-template<> struct conj_helper<Packet8f, Packet4cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
- { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet4cf, Packet8f, false,false>
-{
- EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
- { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
-};
-
-template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
-{
- Packet4cf num = pmul(a, pconj(b));
- __m256 tmp = _mm256_mul_ps(b.v, b.v);
- __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1);
- __m256 denom = _mm256_add_ps(tmp, tmp2);
- return Packet4cf(_mm256_div_ps(num.v, denom));
-}
-
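pdiv above follows the textbook identity a/b = a*conj(b) / |b|^2: the numerator is formed with pmul and pconj, and |b|^2 is built by adding each squared lane to its swapped neighbour so both slots of a pair hold the same denominator. A scalar sketch of that decomposition, illustrative only and not part of the deleted file:

#include <complex>

std::complex<float> cdiv_sketch(const std::complex<float>& a, const std::complex<float>& b) {
  std::complex<float> num = a * std::conj(b);               // pmul(a, pconj(b))
  float denom = b.real() * b.real() + b.imag() * b.imag();  // tmp + shuffled(tmp)
  return std::complex<float>(num.real() / denom, num.imag() / denom);
}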
-template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
-{
- return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
-}
-
-//---------- double ----------
-struct Packet2cd
-{
- EIGEN_STRONG_INLINE Packet2cd() {}
- EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
- __m256d v;
-};
-
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
- typedef Packet2cd type;
- typedef Packet1cd half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 0,
- size = 2,
- HasHalfPacket = 1,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0
- };
-};
-
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
-
-template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
-{
- const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
- return Packet2cd(_mm256_xor_pd(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
-{
- __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
- __m256d even = _mm256_mul_pd(tmp1, b.v);
- __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
- __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
- __m256d odd = _mm256_mul_pd(tmp2, tmp3);
- return Packet2cd(_mm256_addsub_pd(even, odd));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
-{
-  // In case casting to a __m128d* is really not safe, we can still fall back to this version (much slower though):
-// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
- return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, int stride)
-{
- return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
- std::imag(from[0*stride]), std::real(from[0*stride])));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, int stride)
-{
- __m128d low = _mm256_extractf128_pd(from.v, 0);
- to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
- __m128d high = _mm256_extractf128_pd(from.v, 1);
- to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
-{
- __m128d low = _mm256_extractf128_pd(a.v, 0);
- EIGEN_ALIGN16 double res[2];
- _mm_store_pd(res, low);
- return std::complex<double>(res[0],res[1]);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
- __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
- return Packet2cd(result);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
-{
- return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
- Packet1cd(_mm256_extractf128_pd(a.v,1))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
-{
- Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
- Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
-
- return Packet2cd(_mm256_add_pd(t0,t1));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
-{
- return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
- Packet1cd(_mm256_extractf128_pd(a.v,1))));
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet2cd>
-{
- static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
- {
- if (Offset==0) return;
- palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
- }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
- {
- return internal::pmul(a, pconj(b));
- }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
- {
- return internal::pmul(pconj(a), b);
- }
-};
-
-template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
- {
- return pconj(internal::pmul(a, b));
- }
-};
-
-template<> struct conj_helper<Packet4d, Packet2cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
- { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cd, Packet4d, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
- { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
-};
-
-template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
-{
- Packet2cd num = pmul(a, pconj(b));
- __m256d tmp = _mm256_mul_pd(b.v, b.v);
- __m256d denom = _mm256_hadd_pd(tmp, tmp);
- return Packet2cd(_mm256_div_pd(num.v, denom));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
-{
- return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4cf,4>& kernel) {
- __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
- __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
- __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
- __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
-
- __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
- __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
- __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
- __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
-
- kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
- kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
- kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
- kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cd,2>& kernel) {
- __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
- kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
- kernel.packet[0].v = tmp;
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_AVX_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/AVX/MathFunctions.h b/third_party/eigen3/Eigen/src/Core/arch/AVX/MathFunctions.h
deleted file mode 100644
index faa5c79021..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ /dev/null
@@ -1,495 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
-#define EIGEN_MATH_FUNCTIONS_AVX_H
-
-// For some reason, this function didn't make it into the avxintrin.h
-// used by the compiler, so we'll just wrap it.
-#define _mm256_setr_m128(lo, hi) \
- _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
-
-/* The sin, cos, exp, and log functions of this file are loosely derived from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
-namespace Eigen {
-
-namespace internal {
-
-// Sine function
-// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
-// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
-// are (anti-)symmetric and thus have only odd/even coefficients
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-psin<Packet8f>(const Packet8f& _x) {
- Packet8f x = _x;
-
- // Some useful values.
- _EIGEN_DECLARE_CONST_Packet8i(one, 1);
- _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
- _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
- _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07);
- _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00);
-
- // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
- Packet8f z = pmul(x, p8f_one_over_pi);
- Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
- x = pmadd(shift, p8f_neg_pi_first, x);
- x = pmadd(shift, p8f_neg_pi_second, x);
- x = pmadd(shift, p8f_neg_pi_third, x);
- z = pmul(x, p8f_four_over_pi);
-
- // Make a mask for the entries that need flipping, i.e. wherever the shift
- // is odd.
- Packet8i shift_ints = _mm256_cvtps_epi32(shift);
- Packet8i shift_isodd =
- (__m256i)_mm256_and_ps((__m256)shift_ints, (__m256)p8i_one);
-#ifdef EIGEN_VECTORIZE_AVX2
- Packet8i sign_flip_mask = _mm256_slli_epi32(shift_isodd, 31);
-#else
- __m128i lo =
- _mm_slli_epi32(_mm256_extractf128_si256((__m256i)shift_isodd, 0), 31);
- __m128i hi =
- _mm_slli_epi32(_mm256_extractf128_si256((__m256i)shift_isodd, 1), 31);
- Packet8i sign_flip_mask = _mm256_setr_m128(lo, hi);
-#endif
-
- // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
- // is set to ones for that entry.
- Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
-
- // Evaluate the polynomial for the interval [1,3] in z.
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04);
- Packet8f z_minus_two = psub(z, p8f_two);
- Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
- Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
- right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
- right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
-
- // Evaluate the polynomial for the interval [-1,1] in z.
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05);
- Packet8f z2 = pmul(z, z);
- Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
- left = pmadd(left, z2, p8f_coeff_left_3);
- left = pmadd(left, z2, p8f_coeff_left_1);
- left = pmul(left, z);
-
- // Assemble the results, i.e. select the left and right polynomials.
- left = _mm256_andnot_ps(ival_mask, left);
- right = _mm256_and_ps(ival_mask, right);
- Packet8f res = _mm256_or_ps(left, right);
-
- // Flip the sign on the odd intervals and return the result.
- res = _mm256_xor_ps(res, (__m256)sign_flip_mask);
- return res;
-}
-
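Written out for one scalar, the reduction above picks shift = floor(x/pi + 1/4), removes shift*pi in three pieces for extra precision, rescales to z = 4x/pi, evaluates an even polynomial in (z - 2) on the right interval or an odd polynomial in z on the left, and flips the sign on odd periods. A sketch with the same coefficients, illustrative only and not part of the deleted file:

#include <cmath>

float psin_sketch(float x) {
  float shift = std::floor(x * 3.183098861837907e-01f + 0.25f);  // floor(x/pi + 1/4)
  x -= shift * 3.140625f;                                        // subtract shift*pi in
  x -= shift * 9.670257568359375e-04f;                           // three parts to keep
  x -= shift * 6.278329571784980e-07f;                           // the low bits exact
  float z = x * 1.273239544735163f;                              // z = 4*x/pi, in [-1, 3]
  float s;
  if (z > 1.0f) {                                                // right interval: even polynomial in (z - 2)
    float u2 = (z - 2.0f) * (z - 2.0f);
    s = 9.999999724233232e-01f + u2 * (-3.084242535619928e-01f
        + u2 * (1.584991525700324e-02f + u2 * -3.188805084631342e-04f));
  } else {                                                       // left interval: odd polynomial in z
    float z2 = z * z;
    s = z * (7.853981525427295e-01f + z2 * (-8.074536727092352e-02f
        + z2 * (2.489871967827018e-03f + z2 * -3.587725841214251e-05f)));
  }
  return (static_cast<int>(shift) & 1) ? -s : s;                 // flip the sign on odd periods
}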
-// Natural logarithm
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C = log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
-// TODO(gonnet): Further reduce the interval allowing for lower-degree
-// polynomial interpolants -> ... -> profit!
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-plog<Packet8f>(const Packet8f& _x) {
- Packet8f x = _x;
- _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
-
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  // The smallest non-denormalized float number.
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
-
- // Polynomial coefficients.
- _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
-
- // invalid_mask is set to true when x is NaN
- Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ);
- Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
-
- // Truncate input values to the minimum positive normal.
- x = pmax(x, p8f_min_norm_pos);
-
-// Extract the shifted exponents (No bitwise shifting in regular AVX, so
-// convert to SSE and do it there).
-#ifdef EIGEN_VECTORIZE_AVX2
- Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_srli_epi32((__m256i)x, 23));
-#else
- __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256((__m256i)x, 0), 23);
- __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256((__m256i)x, 1), 23);
- Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_setr_m128(lo, hi));
-#endif
- Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
-
- // Set the exponents to -1, i.e. x are in the range [0.5,1).
- x = _mm256_and_ps(x, p8f_inv_mant_mask);
- x = _mm256_or_ps(x, p8f_half);
-
- // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
- // and shift by -1. The values are then centered around 0, which improves
- // the stability of the polynomial evaluation.
- // if( x < SQRTHF ) {
- // e -= 1;
- // x = x + x - 1.0;
- // } else { x = x - 1.0; }
- Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
- Packet8f tmp = _mm256_and_ps(x, mask);
- x = psub(x, p8f_1);
- e = psub(e, _mm256_and_ps(p8f_1, mask));
- x = padd(x, tmp);
-
- Packet8f x2 = pmul(x, x);
- Packet8f x3 = pmul(x2, x);
-
- // Evaluate the polynomial approximant of degree 8 in three parts, probably
- // to improve instruction-level parallelism.
- Packet8f y, y1, y2;
- y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
- y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
- y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
- y = pmadd(y, x, p8f_cephes_log_p2);
- y1 = pmadd(y1, x, p8f_cephes_log_p5);
- y2 = pmadd(y2, x, p8f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- // Add the logarithm of the exponent back to the result of the interpolation.
- y1 = pmul(e, p8f_cephes_log_q1);
- tmp = pmul(x2, p8f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p8f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
-
- // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
- return _mm256_or_ps(
- _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
- _mm256_and_ps(iszero_mask, p8f_minus_inf));
-}
-
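In scalar terms the routine splits x into 2^e * m with m in [sqrt(1/2), sqrt(2)), approximates log(m) by a polynomial centered on m = 1, and adds e*log(2) back in two pieces (q1 + q2 == log(2)) to limit rounding error. A sketch of that decomposition, with std::log1p standing in for the degree-8 polynomial; it assumes a positive normal input and is illustrative only:

#include <cmath>

float plog_sketch(float x) {
  int e;
  float m = std::frexp(x, &e);                      // x = m * 2^e, with m in [0.5, 1)
  if (m < 0.707106781186547524f) { m += m; --e; }   // move m into [sqrt(1/2), sqrt(2))
  float t = m - 1.0f;                               // center the polynomial argument on 0
  float p = std::log1p(t);                          // stand-in for the cephes polynomial above
  return p + e * -2.12194440e-4f                    // e * q1
           + e * 0.693359375f;                      // e * q2, with q1 + q2 == log(2)
}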
-// Exponential function. Works by writing "x = m*log(2) + r" where
-// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
-// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-pexp<Packet8f>(const Packet8f& _x) {
- _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
-
- _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
-
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
-
- // Clamp x.
- Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
-
- // Express exp(x) as exp(m*ln(2) + r), start by extracting
- // m = floor(x/ln(2) + 0.5).
- Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
-
-// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
-// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
-// truncation errors. Note that we don't use the "pmadd" function here to
-// ensure that a precision-preserving FMA instruction is used.
-#ifdef EIGEN_VECTORIZE_FMA
- _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
- Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
-#else
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
- Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
- r = psub(r, pmul(m, p8f_cephes_exp_C2));
-#endif
-
- Packet8f r2 = pmul(r, r);
-
- // TODO(gonnet): Split into odd/even polynomials and try to exploit
- // instruction-level parallelism.
- Packet8f y = p8f_cephes_exp_p0;
- y = pmadd(y, r, p8f_cephes_exp_p1);
- y = pmadd(y, r, p8f_cephes_exp_p2);
- y = pmadd(y, r, p8f_cephes_exp_p3);
- y = pmadd(y, r, p8f_cephes_exp_p4);
- y = pmadd(y, r, p8f_cephes_exp_p5);
- y = pmadd(y, r2, r);
- y = padd(y, p8f_1);
-
- // Build emm0 = 2^m.
- Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
-#ifdef EIGEN_VECTORIZE_AVX2
- emm0 = _mm256_slli_epi32(emm0, 23);
-#else
- __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 0), 23);
- __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 1), 23);
- emm0 = _mm256_setr_m128(lo, hi);
-#endif
-
- // Return 2^m * exp(r).
- return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
-}
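The same decomposition for one scalar: m = floor(x/ln(2) + 1/2), r = x - m*ln(2) removed in two pieces (C1 + C2 == ln(2)), exp(r) from the polynomial, and a final scale by 2^m through the exponent bits. A sketch with std::exp standing in for the degree-6 polynomial and std::ldexp for the exponent manipulation; illustrative only, not part of the deleted file:

#include <algorithm>
#include <cmath>

float pexp_sketch(float x) {
  x = std::min(std::max(x, -88.3762626647949f), 88.3762626647950f);  // clamp to exp_lo/exp_hi
  float m = std::floor(x * 1.44269504088896341f + 0.5f);             // m = floor(x/ln(2) + 1/2)
  float r = x - m * 0.693359375f;                                    // r = x - m*C1
  r += m * 2.12194440e-4f;                                           // r -= m*C2, C1 + C2 == ln(2)
  return std::ldexp(std::exp(r), static_cast<int>(m));               // 2^m * exp(r)
}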
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
-pexp<Packet4d>(const Packet4d& _x) {
- Packet4d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
- _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
- _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
- _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
-
- Packet4d tmp, fx;
-
- // clamp x
- x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
- // Express exp(x) as exp(g + n*log(2)).
- fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
-
- // Get the integer modulus of log(2), i.e. the "n" described above.
- fx = _mm256_floor_pd(fx);
-
- // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
- // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
- // digits right.
- tmp = pmul(fx, p4d_cephes_exp_C1);
- Packet4d z = pmul(fx, p4d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet4d x2 = pmul(x, x);
-
- // Evaluate the numerator polynomial of the rational interpolant.
- Packet4d px = p4d_cephes_exp_p0;
- px = pmadd(px, x2, p4d_cephes_exp_p1);
- px = pmadd(px, x2, p4d_cephes_exp_p2);
- px = pmul(px, x);
-
- // Evaluate the denominator polynomial of the rational interpolant.
- Packet4d qx = p4d_cephes_exp_q0;
- qx = pmadd(qx, x2, p4d_cephes_exp_q1);
- qx = pmadd(qx, x2, p4d_cephes_exp_q2);
- qx = pmadd(qx, x2, p4d_cephes_exp_q3);
-
- // I don't really get this bit, copied from the SSE2 routines, so...
- // TODO(gonnet): Figure out what is going on here, perhaps find a better
- // rational interpolant?
- x = _mm256_div_pd(px, psub(qx, px));
- x = pmadd(p4d_2, x, p4d_1);
-
- // Build e=2^n by constructing the exponents in a 128-bit vector and
- // shifting them to where they belong in double-precision values.
- __m128i emm0 = _mm256_cvtpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
- __m128i lo = _mm_slli_epi64(emm0, 52);
- __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
- __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
- e = _mm256_insertf128_si256(e, hi, 1);
-
- // Construct the result 2^n * exp(g) = e * x. The max is used to catch
- // non-finite values in the input.
- return pmax(pmul(x, Packet4d(e)), _x);
-}
-
-// Functions for sqrt.
-// The EIGEN_FAST_MATH version uses the _mm256_rsqrt_ps approximation and one step
-// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
-// exact solution. The main advantage of this approach is not just speed, but
-// also the fact that it can be inlined and pipelined with other computations,
-// further reducing its effective latency.
-#if EIGEN_FAST_MATH
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-psqrt<Packet8f>(const Packet8f& _x) {
- _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
- _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
-
- Packet8f neg_half = pmul(_x, p8f_minus_half);
-
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- Packet8f non_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_GE_OQ);
- Packet8f x = _mm256_and_ps(non_zero_mask, _mm256_rsqrt_ps(_x));
-
- // Do a single step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
-
-  // Multiply the original _x by its reciprocal square root to extract the
- // square root.
- return pmul(_x, x);
-}
-#else
-template <>
-EIGEN_STRONG_INLINE Packet8f psqrt<Packet8f>(const Packet8f& x) {
- return _mm256_sqrt_ps(x);
-}
-#endif
-template <>
-EIGEN_STRONG_INLINE Packet4d psqrt<Packet4d>(const Packet4d& x) {
- return _mm256_sqrt_pd(x);
-}
-
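The fast-math branch relies on the classical Newton refinement for the reciprocal square root, x <- x*(1.5 - 0.5*a*x*x), followed by a multiplication with a to turn rsqrt(a) into sqrt(a). A scalar sketch, with 1/std::sqrt standing in for the _mm256_rsqrt_ps estimate; illustrative only, not part of the deleted file:

#include <cmath>

float fast_sqrt_sketch(float a) {
  if (a < 1.17549435e-38f) return 0.0f;      // mask out denormal/zero inputs, like p8f_flt_min above
  float x = 1.0f / std::sqrt(a);             // stand-in for the hardware rsqrt estimate
  x = x * (1.5f - 0.5f * a * x * x);         // one Newton step on the reciprocal square root
  return a * x;                              // sqrt(a) = a * rsqrt(a)
}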
-// Functions for rsqrt.
-// Almost identical to the sqrt routine, just leave out the last multiplication
-// and fill in NaN/Inf where needed. Note that this function only exists as an
-// iterative version since there is no instruction for directly computing the
-// reciprocal square root in AVX/AVX2 (there will be one in AVX-512).
-#ifdef EIGEN_FAST_MATH
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-prsqrt<Packet8f>(const Packet8f& _x) {
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
- _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
- _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
-
- Packet8f neg_half = pmul(_x, p8f_minus_half);
-
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
- Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
-
- // Fill in NaNs and Infs for the negative/zero entries.
- Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
- Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
- Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
- _mm256_and_ps(zero_mask, p8f_inf));
-
- // Do a single step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
-
- // Insert NaNs and Infs in all the right places.
- return _mm256_or_ps(x, infs_and_nans);
-}
-#else
-template <>
-EIGEN_STRONG_INLINE Packet8f prsqrt<Packet8f>(const Packet8f& x) {
- _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
- return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
-}
-#endif
-template <>
-EIGEN_STRONG_INLINE Packet4d prsqrt<Packet4d>(const Packet4d& x) {
- _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
- return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
-}
-
-// Functions for division.
-// The EIGEN_FAST_DIV version uses the _mm256_rcp_ps approximation and one step of
-// Newton's method, at a cost of 1-2 bits of precision as opposed to the exact
-// solution. The main advantage of this approach is not just speed, but also the
-// fact that it can be inlined and pipelined with other computations, further
-// reducing its effective latency.
-#if EIGEN_FAST_DIV
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) {
- _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
-
- Packet8f neg_b = pnegate(b);
-
- /* select only the inverse of non-zero b */
- Packet8f non_zero_mask = _mm256_cmp_ps(b, _mm256_setzero_ps(), _CMP_NEQ_OQ);
- Packet8f x = _mm256_and_ps(non_zero_mask, _mm256_rcp_ps(b));
-
- /* One step of Newton's method on b - x^-1 == 0. */
- x = pmul(x, pmadd(neg_b, x, p8f_two));
-
- /* Return Infs wherever there were zeros. */
- return pmul(a, _mm256_or_ps(_mm256_and_ps(non_zero_mask, x),
- _mm256_andnot_ps(non_zero_mask, p8f_inf)));
-}
-#else
-template <>
-EIGEN_STRONG_INLINE Packet8f
-pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) {
- return _mm256_div_ps(a, b);
-}
-#endif
-template <>
-EIGEN_STRONG_INLINE Packet4d
-pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) {
- return _mm256_div_pd(a, b);
-}
-template <>
-EIGEN_STRONG_INLINE Packet8i
-pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/) {
-  eigen_assert(false && "packet integer division is not supported by AVX");
- return pset1<Packet8i>(0);
-}
-
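The guarded fast path uses the analogous Newton refinement for the reciprocal, x <- x*(2 - b*x), and patches in Inf wherever b is exactly zero. A scalar sketch with 1/b standing in for the _mm256_rcp_ps estimate; illustrative only, not part of the deleted file:

#include <limits>

float fast_div_sketch(float a, float b) {
  if (b == 0.0f)
    return a * std::numeric_limits<float>::infinity();  // zeros in b become Infs, as in the mask above
  float x = 1.0f / b;                                    // stand-in for the hardware rcp estimate
  x = x * (2.0f - b * x);                                // one Newton step on b - 1/x == 0
  return a * x;
}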
-// Identical to the ptanh in GenericPacketMath.h, but for doubles use
-// a small/medium approximation threshold of 0.001.
-template<> EIGEN_STRONG_INLINE Packet4d ptanh_approx_threshold() {
- return pset1<Packet4d>(0.001);
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_AVX_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/AVX/PacketMath.h b/third_party/eigen3/Eigen/src/Core/arch/AVX/PacketMath.h
deleted file mode 100644
index 03a7d5127c..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/AVX/PacketMath.h
+++ /dev/null
@@ -1,650 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_AVX_H
-#define EIGEN_PACKET_MATH_AVX_H
-
-namespace Eigen {
-
-namespace internal {
-
-#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
-#endif
-
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
-#endif
-
-#ifdef __FMA__
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#endif
-#endif
-
-typedef __m256 Packet8f;
-typedef __m256i Packet8i;
-typedef __m256d Packet4d;
-
-template<> struct is_arithmetic<__m256> { enum { value = true }; };
-template<> struct is_arithmetic<__m256i> { enum { value = true }; };
-template<> struct is_arithmetic<__m256d> { enum { value = true }; };
-
-#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
- const Packet8f p8f_##NAME = pset1<Packet8f>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
- const Packet8f p8f_##NAME = (__m256)pset1<Packet8i>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
- const Packet8i p8i_##NAME = pset1<Packet8i>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
- const Packet4d p4d_##NAME = pset1<Packet4d>(X)
-
-
-template<> struct packet_traits<float> : default_packet_traits
-{
- typedef Packet8f type;
- typedef Packet4f half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=8,
- HasHalfPacket = 1,
-
- HasDiv = 1,
- HasSin = 1,
- HasCos = 0,
- HasTanH = 1,
- HasBlend = 1,
- HasLog = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- HasSelect = 1,
- HasEq = 1
- };
- };
-template<> struct packet_traits<double> : default_packet_traits
-{
- typedef Packet4d type;
- typedef Packet2d half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 4,
- HasHalfPacket = 1,
-
- HasDiv = 1,
- HasBlend = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- HasSelect = 1,
- HasEq = 1,
- };
-};
-
-/* Proper support for integers is only provided by AVX2. In the meantime, we'll
- use SSE instructions and packets to deal with integers.
-template<> struct packet_traits<int> : default_packet_traits
-{
- typedef Packet8i type;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=8
- };
-};
-*/
-
-template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8}; };
-template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
-template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8}; };
-
-template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
-
-template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
-template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
-
-template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f ple<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_NGT_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d ple<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_NGT_UQ); }
-
-template<> EIGEN_STRONG_INLINE Packet8f plt<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_NGE_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d plt<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_NGE_UQ); }
-
-template<> EIGEN_STRONG_INLINE Packet8f peq<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d peq<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_UQ); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& a, const Packet8f& b, const Packet8f& false_mask) { return _mm256_blendv_ps(a,b,false_mask); }
-template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& a, const Packet4d& b, const Packet4d& false_mask) { return _mm256_blendv_pd(a,b,false_mask); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
-{
- return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
-}
-template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
-{
- return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
-
-#ifdef __FMA__
-template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if EIGEN_GNUC_AT_MOST(4, 8) || EIGEN_COMP_CLANG
- // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
- // and gcc stupidly generates a vfmadd132ps instruction,
- // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
- // the result of the product. the issue has been fixed in gcc 4.9
- Packet8f res = c;
- asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
- return res;
-#else
- return _mm256_fmadd_ps(a,b,c);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if EIGEN_GNUC_AT_MOST(4, 8) || EIGEN_COMP_CLANG
- // see above
- Packet4d res = c;
- asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
- return res;
-#else
- return _mm256_fmadd_pd(a,b,c);
-#endif
-}
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
-
-template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
-
-// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
-template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
-{
- // TODO try to find a way to avoid the need of a temporary register
-// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
-// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
-// return _mm256_unpacklo_ps(tmp,tmp);
-
- // _mm256_insertf128_ps is very slow on Haswell, thus:
- Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
-  // mimic an "in-place" permutation of the lower 128 bits using a blend
- tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
- // then we can perform a consistent permutation on the global register to get everything in shape:
- return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
-}
-// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
-template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
-{
- Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
- return _mm256_permute_pd(tmp, 3<<2);
-}
-
-// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
-template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
-{
- Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
- return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
-
-// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
-template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
-#else
- return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
- from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-#endif
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_i32gather_pd(from, _mm_set1_epi32(stride), 8);
-#else
- return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-#endif
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, int stride)
-{
- __m128 low = _mm256_extractf128_ps(from, 0);
- to[stride*0] = _mm_cvtss_f32(low);
- to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
- to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
- to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
-
- __m128 high = _mm256_extractf128_ps(from, 1);
- to[stride*4] = _mm_cvtss_f32(high);
- to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
- to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
- to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, int stride)
-{
- __m128d low = _mm256_extractf128_pd(from, 0);
- to[stride*0] = _mm_cvtsd_f64(low);
- to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
- __m128d high = _mm256_extractf128_pd(from, 1);
- to[stride*2] = _mm_cvtsd_f64(high);
- to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
-}
-
-template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
-{
- Packet8f pa = pset1<Packet8f>(a);
- pstore(to, pa);
-}
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
-{
- Packet4d pa = pset1<Packet4d>(a);
- pstore(to, pa);
-}
-template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
-{
- Packet8i pa = pset1<Packet8i>(a);
- pstore(to, pa);
-}
-
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-
-template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
- return _mm_cvtss_f32(_mm256_castps256_ps128(a));
-}
-template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
- return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
-}
-template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
- return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
-}
-
-
-template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
-{
- __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
- return _mm256_permute2f128_ps(tmp, tmp, 1);
-}
-template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
-{
- __m256d tmp = _mm256_shuffle_pd(a,a,5);
- return _mm256_permute2f128_pd(tmp, tmp, 1);
-
- __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
- return _mm256_permute_pd(swap_halves,5);
-}
-
-// pabs should be ok
-template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
-{
- const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
- return _mm256_and_ps(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
-{
- const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
- return _mm256_and_pd(a,mask);
-}
-
-// preduxp should be ok
-// FIXME: why is this ok? why isn't the simple implementation working as expected?
-template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
-{
- __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
- __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
- __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
- __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
-
- __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
- __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
- __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
- __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
-
- __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
- __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
- __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
- __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
-
- __m256 sum1 = _mm256_add_ps(perm1, hsum5);
- __m256 sum2 = _mm256_add_ps(perm2, hsum6);
- __m256 sum3 = _mm256_add_ps(perm3, hsum7);
- __m256 sum4 = _mm256_add_ps(perm4, hsum8);
-
- __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
- __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
-
- __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
- return final;
-}
-template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
-{
- Packet4d tmp0, tmp1;
-
- tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
- tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
-
- tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
- tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
-
- return _mm256_blend_pd(tmp0, tmp1, 0xC);
-}
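-// Illustration only: a scalar reference for what preduxp computes, shown for the
-// 4-lane double case. Lane i of the result is the horizontal sum of the i-th
-// input packet, which is what the hadd/permute/blend sequences above implement:
-//   result[i] = vecs[i][0] + vecs[i][1] + vecs[i][2] + vecs[i][3]
-static inline void preduxp_scalar_sketch(const double vecs[4][4], double result[4])
-{
- for (int i = 0; i < 4; ++i)
-   result[i] = vecs[i][0] + vecs[i][1] + vecs[i][2] + vecs[i][3];
-}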
-
-template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
-{
- Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
- tmp0 = _mm256_hadd_ps(tmp0,tmp0);
- return pfirst(_mm256_hadd_ps(tmp0, tmp0));
-}
-template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
-{
- Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
- return pfirst(_mm256_hadd_pd(tmp0,tmp0));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
-{
- return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
-}
-
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
-{
- Packet8f tmp;
- tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
- tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
- return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
-{
- Packet4d tmp;
- tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
- return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
-}
-
-template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
-{
- Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
- tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
- return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
-{
- Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
- return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
-}
-
-template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
-{
- Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
- tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
- return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
-}
-
-template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
-{
- Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
- return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
-}
-
-
-template<int Offset>
-struct palign_impl<Offset,Packet8f>
-{
- static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
- {
- if (Offset==1)
- {
- first = _mm256_blend_ps(first, second, 1);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
- first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
- }
- else if (Offset==2)
- {
- first = _mm256_blend_ps(first, second, 3);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
- first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
- }
- else if (Offset==3)
- {
- first = _mm256_blend_ps(first, second, 7);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
- first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
- }
- else if (Offset==4)
- {
- first = _mm256_blend_ps(first, second, 15);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
- first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
- }
- else if (Offset==5)
- {
- first = _mm256_blend_ps(first, second, 31);
- first = _mm256_permute2f128_ps(first, first, 1);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
- first = _mm256_permute2f128_ps(tmp, tmp, 1);
- first = _mm256_blend_ps(tmp, first, 0x88);
- }
- else if (Offset==6)
- {
- first = _mm256_blend_ps(first, second, 63);
- first = _mm256_permute2f128_ps(first, first, 1);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
- first = _mm256_permute2f128_ps(tmp, tmp, 1);
- first = _mm256_blend_ps(tmp, first, 0xcc);
- }
- else if (Offset==7)
- {
- first = _mm256_blend_ps(first, second, 127);
- first = _mm256_permute2f128_ps(first, first, 1);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
- first = _mm256_permute2f128_ps(tmp, tmp, 1);
- first = _mm256_blend_ps(tmp, first, 0xee);
- }
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4d>
-{
- static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
- {
- if (Offset==1)
- {
- first = _mm256_blend_pd(first, second, 1);
- __m256d tmp = _mm256_permute_pd(first, 5);
- first = _mm256_permute2f128_pd(tmp, tmp, 1);
- first = _mm256_blend_pd(tmp, first, 0xA);
- }
- else if (Offset==2)
- {
- first = _mm256_blend_pd(first, second, 3);
- first = _mm256_permute2f128_pd(first, first, 1);
- }
- else if (Offset==3)
- {
- first = _mm256_blend_pd(first, second, 7);
- __m256d tmp = _mm256_permute_pd(first, 5);
- first = _mm256_permute2f128_pd(tmp, tmp, 1);
- first = _mm256_blend_pd(tmp, first, 5);
- }
- }
-};
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8f,8>& kernel) {
- __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
- __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
- __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
- __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
- __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
- __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
- __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
- __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
- __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
- __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
- __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
- __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
- __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
- __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
- __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
- __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
- kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
- kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
- kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
- kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
- kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
- kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
- kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
- kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8f,4>& kernel) {
- __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
- __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
- __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
- __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
-
- __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
- __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
- __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
- __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
-
- kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
- kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
- kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
- kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4d,4>& kernel) {
- __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
- __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
- __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
- __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
-
- kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
- kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
- kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
- kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
- const __m256 zero = _mm256_setzero_ps();
- const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
- return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
- const __m256d zero = _mm256_setzero_pd();
- const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
- return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
-}
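-// Illustration only: the scalar semantics of pblend. A non-zero selector picks
-// the "then" lane and a zero selector picks the "else" lane; the code above
-// builds the complementary mask (select == 0) and lets blendv pick elsePacket there.
-static inline void pblend_scalar_sketch(const int select[4], const double thenP[4],
-                                        const double elseP[4], double out[4])
-{
- for (int i = 0; i < 4; ++i)
-   out[i] = select[i] ? thenP[i] : elseP[i];
-}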
-
-// Functions to print vectors of different types; they make debugging much easier.
-namespace{
-void print4f(const char* name, __m128 val) {
- float temp[4] __attribute__((aligned(32)));
- _mm_store_ps(temp, val);
- printf("%s: ", name);
- for (int k = 0; k < 4; k++) printf("%.8e ", temp[k]);
- printf("\n");
-}
-void print8f(const char* name, __m256 val) {
- float temp[8] __attribute__((aligned(32)));
- _mm256_store_ps(temp, val);
- printf("%s: ", name);
- for (int k = 0; k < 8; k++) printf("%.8e ", temp[k]);
- printf("\n");
-}
-void print4i(const char* name, __m128i val) {
- int temp[4] __attribute__((aligned(32)));
- _mm_store_si128((__m128i*)temp, val);
- printf("%s: ", name);
- for (int k = 0; k < 4; k++) printf("%i ", temp[k]);
- printf("\n");
-}
-void print8i(const char* name, __m256i val) {
- int temp[8] __attribute__((aligned(32)));
- _mm256_store_si256((__m256i*)temp, val);
- printf("%s: ", name);
- for (int k = 0; k < 8; k++) printf("%i ", temp[k]);
- printf("\n");
-}
-void print8b(const char* name, __m256i val) {
- int temp[8] __attribute__((aligned(32)));
- _mm256_store_si256((__m256i*)temp, val);
- printf("%s: ", name);
- for (int k = 0; k < 8; k++) printf("0x%08x ", temp[k]);
- printf("\n");
-}
-void print4d(const char* name, __m256d val) {
- double temp[4] __attribute__((aligned(32)));
- _mm256_store_pd(temp, val);
- printf("%s: ", name);
- for (int k = 0; k < 4; k++) printf("%.16e ", temp[k]);
- printf("\n");
-}
-} // end anonymous namespace
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_AVX_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/AVX/TypeCasting.h b/third_party/eigen3/Eigen/src/Core/arch/AVX/TypeCasting.h
deleted file mode 100644
index 83bfdc604b..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TYPE_CASTING_AVX_H
-#define EIGEN_TYPE_CASTING_AVX_H
-
-namespace Eigen {
-
-namespace internal {
-
-// For now we use SSE to handle integers, so we can't use AVX instructions to cast
-// from int to float
-template <>
-struct type_casting_traits<float, int> {
- enum {
- VectorizedCast = 0,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
-};
-
-template <>
-struct type_casting_traits<int, float> {
- enum {
- VectorizedCast = 0,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
-};
-
-
-
-template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
- return _mm256_cvtps_epi32(a);
-}
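-// Illustration only: _mm256_cvtps_epi32 converts according to the current MXCSR
-// rounding mode (round-to-nearest-even by default); it does not truncate the way
-// a C-style cast does. For example, under the default rounding mode:
-//   pcast<Packet8f, Packet8i>(pset1<Packet8f>(1.5f)); // every lane holds 2
-//   static_cast<int>(1.5f);                           // truncates to 1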
-
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
- return _mm256_cvtepi32_ps(a);
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TYPE_CASTING_AVX_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h b/third_party/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h
deleted file mode 100644
index 57df9508b3..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h
+++ /dev/null
@@ -1,439 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COMPLEX32_ALTIVEC_H
-#define EIGEN_COMPLEX32_ALTIVEC_H
-
-
-namespace Eigen {
-
-namespace internal {
-
-static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-#ifdef EIGEN_VECTORIZE_VSX
-#ifdef _BIG_ENDIAN
-static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-#else
-static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-#endif
-#endif // EIGEN_VECTORIZE_VSX
-
-//---------- float ----------
-struct Packet2cf
-{
- EIGEN_STRONG_INLINE Packet2cf() {}
- EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
- Packet4f v;
-};
-
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
- typedef Packet2cf type;
- typedef Packet2cf half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 2,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0
- };
-};
-
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
-
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
- Packet2cf res;
- /* On AltiVec we cannot load 64-bit registers, so we have to take care of alignment */
- if((ptrdiff_t(&from) % 16) == 0)
- res.v = pload<Packet4f>((const float *)&from);
- else
- res.v = ploadu<Packet4f>((const float *)&from);
- res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
- return res;
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
-{
- std::complex<float> EIGEN_ALIGN16 af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- return Packet2cf(vec_ld(0, (const float*)af));
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
-{
- std::complex<float> EIGEN_ALIGN16 af[2];
- vec_st(from.v, 0, (float*)af);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
-}
-
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- Packet4f v1, v2;
-
- // Permute and multiply the real parts of a and b
- v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
- // Get the imaginary parts of a
- v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
- // multiply a_re * b
- v1 = vec_madd(v1, b.v, p4f_ZERO);
- // multiply a_im * b and get the conjugate result
- v2 = vec_madd(v2, b.v, p4f_ZERO);
- v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
- // permute back to a proper order
- v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
-
- return Packet2cf(vec_add(v1, v2));
-}
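-// Illustration only: the scalar identity evaluated per complex lane above,
-//   (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br).
-// The first vec_madd yields {ar*br, ar*bi}; the second, after the conjugate xor
-// and the pair swap, yields {-ai*bi, ai*br}; the final vec_add combines them.
-static inline void cmul_scalar_sketch(float ar, float ai, float br, float bi,
-                                      float& re, float& im)
-{
- re = ar*br - ai*bi;
- im = ar*bi + ai*br;
-}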
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
-{
- return pset1<Packet2cf>(*from);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
-
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
- std::complex<float> EIGEN_ALIGN16 res[2];
- pstore((float *)&res, a.v);
-
- return res[0];
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
- Packet4f rev_a;
- rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
- return Packet2cf(rev_a);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
- Packet4f b;
- b = (Packet4f) vec_sld(a.v, a.v, 8);
- b = padd(a.v, b);
- return pfirst(Packet2cf(b));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
- Packet4f b1, b2;
-#ifdef _BIG_ENDIAN
- b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
- b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
-#else
- b1 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
- b2 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
-#endif
- b2 = (Packet4f) vec_sld(b2, b2, 8);
- b2 = padd(b1, b2);
-
- return Packet2cf(b2);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
- Packet4f b;
- Packet2cf prod;
- b = (Packet4f) vec_sld(a.v, a.v, 8);
- prod = pmul(a, Packet2cf(b));
-
- return pfirst(prod);
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
- static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
- {
- if (Offset==1)
- {
-#ifdef _BIG_ENDIAN
- first.v = vec_sld(first.v, second.v, 8);
-#else
- first.v = vec_sld(second.v, first.v, 8);
-#endif
- }
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- return internal::pmul(a, pconj(b));
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- return internal::pmul(pconj(a), b);
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- return pconj(internal::pmul(a, b));
- }
-};
-
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- // TODO optimize it for AltiVec
- Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
- Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
- return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
-}
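-// Illustration only: the identity behind the division above,
-//   a / b = a * conj(b) / (br*br + bi*bi).
-// conj_helper<...,false,true> computes a * conj(b), and the vec_madd/vec_perm
-// pair builds the per-lane denominator {br*br + bi*bi, br*br + bi*bi}.
-static inline void cdiv_scalar_sketch(float ar, float ai, float br, float bi,
-                                      float& re, float& im)
-{
- const float d = br*br + bi*bi; // |b|^2
- re = (ar*br + ai*bi) / d;
- im = (ai*br - ar*bi) / d;
-}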
-
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
-{
- return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
-}
-
-template<> EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
- Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
- kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
- kernel.packet[0].v = tmp;
-}
-
-//---------- double ----------
-#if defined(EIGEN_VECTORIZE_VSX)
-struct Packet1cd
-{
- EIGEN_STRONG_INLINE Packet1cd() {}
- EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
- Packet2d v;
-};
-
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
- typedef Packet1cd type;
- typedef Packet1cd half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 0,
- size = 1,
- HasHalfPacket = 0,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0
- };
-};
-
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
-
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
-
-// Google-local: Change type from DenseIndex to int in patch.
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, int/*DenseIndex*/ stride)
-{
- std::complex<double> EIGEN_ALIGN16 af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- return pload<Packet1cd>(af);
-}
-// Google-local: Change type from DenseIndex to int in patch.
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, int/*DenseIndex*/ stride)
-{
- std::complex<double> EIGEN_ALIGN16 af[2];
- pstore<std::complex<double> >(af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_sub(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- Packet2d a_re, a_im, v1, v2;
-
- // Permute and multiply the real parts of a and b
- a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
- // Get the imaginary parts of a
- a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
- // multiply a_re * b
- v1 = vec_madd(a_re, b.v, p2d_ZERO);
- // multiply a_im * b and get the conjugate result
- v2 = vec_madd(a_im, b.v, p2d_ZERO);
- v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
- v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
-
- return Packet1cd(vec_add(v1, v2));
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
-{
- return pset1<Packet1cd>(*from);
-}
-
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { vec_dstt((long *)addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
-
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
- std::complex<double> EIGEN_ALIGN16 res[2];
- pstore<std::complex<double> >(res, a);
-
- return res[0];
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
- return vecs[0];
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
- static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
- {
- // FIXME is it sure we never have to align a Packet1cd?
- // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- return internal::pmul(a, pconj(b));
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- return internal::pmul(pconj(a), b);
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- return pconj(internal::pmul(a, b));
- }
-};
-
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- // TODO optimize it for AltiVec
- Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
- Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
- return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64))));
-}
-
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
- return Packet1cd(preverse(Packet2d(x.v)));
-}
-
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
- Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
- kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
- kernel.packet[0].v = tmp;
-}
-#endif // EIGEN_VECTORIZE_VSX
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/third_party/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h
deleted file mode 100644
index e3545b4abc..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ /dev/null
@@ -1,299 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2007 Julien Pommier
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
-#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
-#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
-
-#include <iostream>
-
-#define DUMP(v) do { std::cout << #v " = " << (v) << std::endl; } while(0)
-
-namespace Eigen {
-
-namespace internal {
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
- _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
- /* the smallest non denormalized float number */
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
-
-  /* natural logarithm computed for 4 simultaneous floats;
-     returns -inf for x == 0 and NaN for x < 0 or NaN inputs
-  */
- _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-
- Packet4i emm0;
-
- /* isvalid_mask is 0 if x < 0 or x is NaN. */
- Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
- Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
-
- x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
- emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
- reinterpret_cast<Packet4ui>(p4i_23));
-
- /* keep only the fractional part */
- x = pand(x, p4f_inv_mant_mask);
- x = por(x, p4f_half);
-
- emm0 = psub(emm0, p4i_0x7f);
- Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
- Packet4f tmp = pand(x, mask);
- x = psub(x, p4f_1);
- e = psub(e, pand(p4f_1, mask));
- x = padd(x, tmp);
-
- Packet4f x2 = pmul(x,x);
- Packet4f x3 = pmul(x2,x);
-
- Packet4f y, y1, y2;
- y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
- y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
- y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
- y = pmadd(y , x, p4f_cephes_log_p2);
- y1 = pmadd(y1, x, p4f_cephes_log_p5);
- y2 = pmadd(y2, x, p4f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- y1 = pmul(e, p4f_cephes_log_q1);
- tmp = pmul(x2, p4f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p4f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
- // negative arg will be NAN, 0 will be -INF
- x = vec_sel(x, p4f_minus_inf, iszero_mask);
- x = vec_sel(p4f_minus_nan, x, isvalid_mask);
- return x;
-}
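-// Illustration only, assuming <cmath> is available: the scalar shape of the
-// range reduction above. The packet code extracts the exponent with integer bit
-// tricks instead of frexpf and replaces logf with the cephes polynomial
-// evaluated on the reduced mantissa.
-static inline float plog_reduction_sketch(float x) // for x > 0
-{
- int e;
- float m = frexpf(x, &e);         // x = m * 2^e, with m in [0.5, 1)
- if (m < 0.707106781186547524f) { // cephes_SQRTHF: keep m in [sqrt(1/2), sqrt(2))
-   m += m;
-   --e;
- }
- return logf(m) + 0.693359375f * e - 2.12194440e-4f * e; // log(m) + e*log(2)
-}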
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
- _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-
- _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
- Packet4f tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = vec_max(vec_min(x, p4f_exp_hi), p4f_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
- fx = vec_floor(fx);
-
- tmp = pmul(fx, p4f_cephes_exp_C1);
- Packet4f z = pmul(fx, p4f_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- z = pmul(x,x);
-
- Packet4f y = p4f_cephes_exp_p0;
- y = pmadd(y, x, p4f_cephes_exp_p1);
- y = pmadd(y, x, p4f_cephes_exp_p2);
- y = pmadd(y, x, p4f_cephes_exp_p3);
- y = pmadd(y, x, p4f_cephes_exp_p4);
- y = pmadd(y, x, p4f_cephes_exp_p5);
- y = pmadd(y, z, x);
- y = padd(y, p4f_1);
-
- // build 2^n
- emm0 = vec_cts(fx, 0);
- emm0 = vec_add(emm0, p4i_0x7f);
- emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
-
- // Altivec's max & min operators just drop silent NaNs. Check NaNs in
- // inputs and return them unmodified.
- Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
- return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
- isnumber_mask);
-}
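-// Illustration only, assuming <cmath> is available: the scalar shape of the
-// range reduction above. With n = round(x / log(2)) and g = x - n*log(2),
-// exp(x) = 2^n * exp(g); the packet code replaces expf(g) with a degree-5
-// cephes polynomial and builds 2^n directly in the exponent bits.
-static inline float pexp_reduction_sketch(float x)
-{
- const float n = floorf(x * 1.44269504088896341f + 0.5f);   // round(x / log(2))
- const float g = x - n * 0.693359375f + n * 2.12194440e-4f; // x - n*log(2), split for accuracy
- return ldexpf(expf(g), static_cast<int>(n));               // 2^n * exp(g)
-}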
-
-#ifdef __VSX__
-
-#undef GCC_VERSION
-#define GCC_VERSION (__GNUC__ * 10000 \
- + __GNUC_MINOR__ * 100 \
- + __GNUC_PATCHLEVEL__)
-
-// VSX support varies between different compilers and even different
-// versions of the same compiler. For gcc version >= 4.9.3, we can use
-// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
-// a slow version that works with older compilers.
-static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if GCC_VERSION >= 40903 || defined(__clang__)
- return vec_cts(x, 0);
-#else
- double tmp[2];
- memcpy(tmp, &x, sizeof(tmp));
- Packet2l l = { static_cast<long long>(tmp[0]),
- static_cast<long long>(tmp[1]) };
- return l;
-#endif
-}
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
-{
- Packet2d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
- _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
- _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
- Packet2d tmp, fx;
- Packet2l emm0;
-
- // clamp x
- x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
- fx = vec_floor(fx);
-
- tmp = pmul(fx, p2d_cephes_exp_C1);
- Packet2d z = pmul(fx, p2d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet2d x2 = pmul(x,x);
-
- Packet2d px = p2d_cephes_exp_p0;
- px = pmadd(px, x2, p2d_cephes_exp_p1);
- px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
-
- Packet2d qx = p2d_cephes_exp_q0;
- qx = pmadd(qx, x2, p2d_cephes_exp_q1);
- qx = pmadd(qx, x2, p2d_cephes_exp_q2);
- qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
-
- // build 2^n
- emm0 = ConvertToPacket2l(fx);
-
-#ifdef __POWER8_VECTOR__
- static const Packet2l p2l_1023 = { 1023, 1023 };
- static const Packet2ul p2ul_52 = { 52, 52 };
-
- emm0 = vec_add(emm0, p2l_1023);
- emm0 = vec_sl(emm0, p2ul_52);
-#else
- // Code is a bit complex for POWER7. There is actually a
- // vec_xxsldi intrinsic but it is not supported by some gcc versions.
- // So we shift (52-32) bits and do a word swap with zeros.
- _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
- _EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32
-
- Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
- emm04i = vec_add(emm04i, p4i_1023);
- emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
- static const Packet16uc perm = {
- 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
- 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
-#ifdef _BIG_ENDIAN
- emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
-#else
- emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
-#endif
-
-#endif
-
- // Altivec's max & min operators just drop silent NaNs. Check NaNs in
- // inputs and return them unmodified.
- Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
- return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
- isnumber_mask);
-}
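-// Illustration only: the bit trick used above to build 2^n for a double,
-// assuming <cstring> is available for memcpy. Adding the IEEE-754 bias (1023)
-// and shifting the result into the 11 exponent bits (bits 52..62) produces the
-// representation of 2^n, as long as n stays within the normal exponent range.
-static inline double exp2_bits_sketch(long long n)
-{
- const unsigned long long bits = static_cast<unsigned long long>(n + 1023) << 52;
- double result;
- memcpy(&result, &bits, sizeof(result));
- return result;
-}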
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h b/third_party/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h
deleted file mode 100644
index 640488e92b..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ /dev/null
@@ -1,943 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
-#define EIGEN_PACKET_MATH_ALTIVEC_H
-
-namespace Eigen {
-
-namespace internal {
-
-#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
-#endif
-
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#endif
-
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
-// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
-#endif
-
-typedef __vector float Packet4f;
-typedef __vector int Packet4i;
-typedef __vector unsigned int Packet4ui;
-typedef __vector __bool int Packet4bi;
-typedef __vector short int Packet8i;
-typedef __vector unsigned char Packet16uc;
-
-// We don't want to write the same code all the time, but we need to reuse the constants
-// and it doesn't really work to declare them global, so we define macros instead
-
-#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
- Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
-
-#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
- Packet4i p4i_##NAME = vec_splat_s32(X)
-
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- Packet4f p4f_##NAME = pset1<Packet4f>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- Packet4i p4i_##NAME = pset1<Packet4i>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
- Packet2d p2d_##NAME = pset1<Packet2d>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
- Packet2l p2l_##NAME = pset1<Packet2l>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
- const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
-
-#define DST_CHAN 1
-#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
-
-// These constants are endian-agnostic
-static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
-#ifndef __VSX__
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
-#endif
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
-static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
-
-static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
-static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
-
-static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
-static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
-
-// Mask alignment
-#ifdef __PPC64__
-#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
-#else
-#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
-#endif
-
-#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
-
-// Handle endianness properly while loading constants
-// Define global static constants:
-#ifdef _BIG_ENDIAN
-static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
-static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
-#else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
-static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
-#endif // _BIG_ENDIAN
-
-static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
-
-static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
-
-#ifdef _BIG_ENDIAN
-static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-#else
-static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-#endif // _BIG_ENDIAN
-
-template<> struct packet_traits<float> : default_packet_traits
-{
- typedef Packet4f type;
- typedef Packet4f half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=4,
-
- // FIXME check the Has*
-#if defined(__VSX__)
- HasDiv = 1,
-#endif
- HasSin = 0,
- HasCos = 0,
- HasLog = 1,
- HasExp = 1,
- HasSqrt = 0
- };
-};
-template<> struct packet_traits<int> : default_packet_traits
-{
- typedef Packet4i type;
- typedef Packet4i half;
- enum {
- // FIXME check the Has*
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=4
- };
-};
-
-
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
-
-inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
-{
- union {
- Packet16uc v;
- unsigned char n[16];
- } vt;
- vt.v = v;
- for (int i=0; i< 16; i++)
- s << (int)vt.n[i] << ", ";
- return s;
-}
-
-inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
-{
- union {
- Packet4f v;
- float n[4];
- } vt;
- vt.v = v;
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
- return s;
-}
-
-inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
-{
- union {
- Packet4i v;
- int n[4];
- } vt;
- vt.v = v;
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
- return s;
-}
-
-inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
-{
- union {
- Packet4ui v;
- unsigned int n[4];
- } vt;
- vt.v = v;
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
- return s;
-}
-/*
-inline std::ostream & operator <<(std::ostream & s, const Packet4bi & v)
-{
- union {
- Packet4bi v;
- unsigned int n[4];
- } vt;
- vt.v = v;
- s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
- return s;
-}*/
-
-
-// Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
- float EIGEN_ALIGN16 af[4];
- af[0] = from;
- Packet4f vc = pload<Packet4f>(af);
- vc = vec_splat(vc, 0);
- return vc;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
- int EIGEN_ALIGN16 ai[4];
- ai[0] = from;
- Packet4i vc = pload<Packet4i>(ai);
- vc = vec_splat(vc, 0);
- return vc;
-}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
- a3 = pload<Packet4f>(a);
- a0 = vec_splat(a3, 0);
- a1 = vec_splat(a3, 1);
- a2 = vec_splat(a3, 2);
- a3 = vec_splat(a3, 3);
-}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4i>(const int *a,
- Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
-{
- a3 = pload<Packet4i>(a);
- a0 = vec_splat(a3, 0);
- a1 = vec_splat(a3, 1);
- a2 = vec_splat(a3, 2);
- a3 = vec_splat(a3, 3);
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
-{
- float EIGEN_ALIGN16 af[4];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- af[2] = from[2*stride];
- af[3] = from[3*stride];
- return pload<Packet4f>(af);
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
-{
- int EIGEN_ALIGN16 ai[4];
- ai[0] = from[0*stride];
- ai[1] = from[1*stride];
- ai[2] = from[2*stride];
- ai[3] = from[3*stride];
- return pload<Packet4i>(ai);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
-{
- float EIGEN_ALIGN16 af[4];
- pstore<float>(af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
- to[2*stride] = af[2];
- to[3*stride] = af[3];
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
-{
- int EIGEN_ALIGN16 ai[4];
- pstore<int>((int *)ai, from);
- to[0*stride] = ai[0];
- to[1*stride] = ai[1];
- to[2*stride] = ai[2];
- to[3*stride] = ai[3];
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
-
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
-/* Commented out: it's actually slower than scalar processing
- *
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
- // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
- //Set up constants, variables
- Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
-
- // Get the absolute values
- a1 = vec_abs(a);
- b1 = vec_abs(b);
-
- // Get the signs using xor
- Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
-
- // Do the multiplication for the absolute values.
- bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
- low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
- high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
- high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
- prod = vec_add( low_prod, high_prod );
-
- // NOR the product and select only the negative elements according to the sign mask
- prod_ = vec_nor(prod, prod);
- prod_ = vec_sel(p4i_ZERO, prod_, sgn);
-
- // Add 1 to the result to get the negative numbers
- v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
- prod_ = vec_add(prod_, v1sel);
-
- // Merge the results back to the final vector.
- prod = vec_sel(prod, prod_, sgn);
-
- return prod;
-}
-*/
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-#if !defined(__VSX__) // VSX actually provides a div instruction
- Packet4f t, y_0, y_1;
-
- // Altivec does not offer a divide instruction, so we have to do a reciprocal approximation
- y_0 = vec_re(b);
-
- // Do one Newton-Raphson iteration to get the needed accuracy
- t = vec_nmsub(y_0, b, p4f_ONE);
- y_1 = vec_madd(y_0, t, y_0);
-
- return vec_madd(a, y_1, p4f_ZERO);
-#else
- return vec_div(a, b);
-#endif
-}
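-// Illustration only: the scalar form of the refinement used above. vec_re gives
-// an estimate y0 of 1/b accurate to roughly 12 bits; one Newton-Raphson step,
-// y1 = y0 + y0*(1 - b*y0), roughly doubles the number of correct bits, and
-// a / b is then computed as a * y1.
-static inline float reciprocal_nr_sketch(float b, float y0)
-{
- const float t = 1.0f - b * y0; // residual, matches vec_nmsub(y_0, b, p4f_ONE)
- return y0 + y0 * t;            // matches vec_madd(y_0, t, y_0)
-}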
-
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division is not supported by AltiVec");
- return pset1<Packet4i>(0);
-}
-
-// For some weird reason, this has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
-
-#ifdef _BIG_ENDIAN
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
- EIGEN_DEBUG_ALIGNED_LOAD
- Packet16uc MSQ, LSQ;
- Packet16uc mask;
- MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
- LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
- mask = vec_lvsl(0, from); // create the permute mask
- return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data
-
-}
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
- EIGEN_DEBUG_ALIGNED_LOAD
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
- Packet16uc MSQ, LSQ;
- Packet16uc mask;
- MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
- LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
- mask = vec_lvsl(0, from); // create the permute mask
- return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
-}
-#else
-// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
- EIGEN_DEBUG_ALIGNED_LOAD
- return (Packet4i) vec_vsx_ld((long)from & 15, (const Packet4i*) _EIGEN_ALIGNED_PTR(from));
-}
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
- EIGEN_DEBUG_UNALIGNED_LOAD
- return (Packet4f) vec_vsx_ld((long)from & 15, (const Packet4f*) _EIGEN_ALIGNED_PTR(from));
-}
-#endif
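The vec_ld / vec_lvsl / vec_perm idiom in the big-endian ploadu path builds an unaligned 16-byte load out of two aligned loads and a byte select. A rough scalar sketch of that idea (unaligned_load16 is an illustrative name, not AltiVec code):

    #include <cstdint>
    // Emulate an unaligned 16-byte load from two aligned 16-byte loads plus a
    // byte select, the way vec_ld / vec_lvsl / vec_perm do it above.
    void unaligned_load16(const unsigned char* from, unsigned char out[16]) {
      std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(from);
      const unsigned char* msq = reinterpret_cast<const unsigned char*>(addr & ~std::uintptr_t(15));
      const unsigned char* lsq = msq + 16;                // the next aligned quadword
      unsigned shift = static_cast<unsigned>(addr & 15);  // what vec_lvsl encodes as a permute mask
      for (unsigned i = 0; i < 16; ++i)                   // vec_perm(MSQ, LSQ, mask)
        out[i] = (shift + i < 16) ? msq[shift + i] : lsq[shift + i - 16];
    }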
-
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
- Packet4f p;
- if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
- else p = ploadu<Packet4f>(from);
- return vec_perm(p, p, p16uc_DUPLICATE32_HI);
-}
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
-{
- Packet4i p;
- if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
- else p = ploadu<Packet4i>(from);
- return vec_perm(p, p, p16uc_DUPLICATE32_HI);
-}
-
-#ifdef _BIG_ENDIAN
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
-{
- EIGEN_DEBUG_UNALIGNED_STORE
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
- // Warning: not thread safe!
- Packet16uc MSQ, LSQ, edges;
- Packet16uc edgeAlign, align;
-
- MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
- LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
- edgeAlign = vec_lvsl(0, to); // permute map to extract edges
- edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
- align = vec_lvsr( 0, to ); // permute map to misalign data
- MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
- LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
- vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
- vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
-{
- EIGEN_DEBUG_UNALIGNED_STORE
- // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
- // Warning: not thread safe!
- Packet16uc MSQ, LSQ, edges;
- Packet16uc edgeAlign, align;
-
- MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
- LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
- edgeAlign = vec_lvsl(0, to); // permute map to extract edges
- edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
- align = vec_lvsr( 0, to ); // permute map to misalign data
- MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ)
- LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ)
- vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
- vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
-}
-#else
-// We also need to redefine little endian storing of Packet4i/Packet4f using VSX
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
-{
- EIGEN_DEBUG_UNALIGNED_STORE
- vec_vsx_st(from, (long)to & 15, (Packet4i*) _EIGEN_ALIGNED_PTR(to));
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
-{
- EIGEN_DEBUG_UNALIGNED_STORE
- vec_vsx_st(from, (long)to & 15, (Packet4f*) _EIGEN_ALIGNED_PTR(to));
-}
-#endif
-
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
-
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
-
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet4f b, sum;
- b = (Packet4f) vec_sld(a, a, 8);
- sum = vec_add(a, b);
- b = (Packet4f) vec_sld(sum, sum, 4);
- sum = vec_add(sum, b);
- return pfirst(sum);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
- Packet4f v[4], sum[4];
-
- // It's easier and faster to transpose, then add as columns
- // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
- // Do the transpose, first set of moves
- v[0] = vec_mergeh(vecs[0], vecs[2]);
- v[1] = vec_mergel(vecs[0], vecs[2]);
- v[2] = vec_mergeh(vecs[1], vecs[3]);
- v[3] = vec_mergel(vecs[1], vecs[3]);
- // Get the resulting vectors
- sum[0] = vec_mergeh(v[0], v[2]);
- sum[1] = vec_mergel(v[0], v[2]);
- sum[2] = vec_mergeh(v[1], v[3]);
- sum[3] = vec_mergel(v[1], v[3]);
-
- // Now do the summation:
- // Lines 0+1
- sum[0] = vec_add(sum[0], sum[1]);
- // Lines 2+3
- sum[1] = vec_add(sum[2], sum[3]);
- // Add the results
- sum[0] = vec_add(sum[0], sum[1]);
-
- return sum[0];
-}
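The merge sequence above is a 4x4 transpose that turns four horizontal reductions into three vector adds. Its intended result, written in plain scalar form (preduxp_ref is an illustrative name, with plain arrays standing in for packets):

    // Reference semantics of preduxp: out[i] is the horizontal sum of vecs[i].
    // The vec_mergeh/vec_mergel sequence above is a 4x4 transpose that lets the
    // four sums be computed with vector adds instead of four separate reductions.
    void preduxp_ref(const float vecs[4][4], float out[4]) {
      for (int i = 0; i < 4; ++i) {
        out[i] = 0.0f;
        for (int j = 0; j < 4; ++j)
          out[i] += vecs[i][j];
      }
    }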
-
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
- Packet4i sum;
- sum = vec_sums(a, p4i_ZERO);
-#ifdef _BIG_ENDIAN
- sum = vec_sld(sum, p4i_ZERO, 12);
-#else
- sum = vec_sld(p4i_ZERO, sum, 4);
-#endif
- return pfirst(sum);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
- Packet4i v[4], sum[4];
-
- // It's easier and faster to transpose, then add as columns
- // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
- // Do the transpose, first set of moves
- v[0] = vec_mergeh(vecs[0], vecs[2]);
- v[1] = vec_mergel(vecs[0], vecs[2]);
- v[2] = vec_mergeh(vecs[1], vecs[3]);
- v[3] = vec_mergel(vecs[1], vecs[3]);
- // Get the resulting vectors
- sum[0] = vec_mergeh(v[0], v[2]);
- sum[1] = vec_mergel(v[0], v[2]);
- sum[2] = vec_mergeh(v[1], v[3]);
- sum[3] = vec_mergel(v[1], v[3]);
-
- // Now do the summation:
- // Lines 0+1
- sum[0] = vec_add(sum[0], sum[1]);
- // Lines 2+3
- sum[1] = vec_add(sum[2], sum[3]);
- // Add the results
- sum[0] = vec_add(sum[0], sum[1]);
-
- return sum[0];
-}
-
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
- Packet4f prod;
- prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
- return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
-}
-
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
- EIGEN_ALIGN16 int aux[4];
- pstore(aux, a);
- return aux[0] * aux[1] * aux[2] * aux[3];
-}
-
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
- Packet4f b, res;
- b = vec_min(a, vec_sld(a, a, 8));
- res = vec_min(b, vec_sld(b, b, 4));
- return pfirst(res);
-}
-
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
- Packet4i b, res;
- b = vec_min(a, vec_sld(a, a, 8));
- res = vec_min(b, vec_sld(b, b, 4));
- return pfirst(res);
-}
-
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
- Packet4f b, res;
- b = vec_max(a, vec_sld(a, a, 8));
- res = vec_max(b, vec_sld(b, b, 4));
- return pfirst(res);
-}
-
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
- Packet4i b, res;
- b = vec_max(a, vec_sld(a, a, 8));
- res = vec_max(b, vec_sld(b, b, 4));
- return pfirst(res);
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
- {
-#ifdef _BIG_ENDIAN
- switch (Offset % 4) {
- case 1:
- first = vec_sld(first, second, 4); break;
- case 2:
- first = vec_sld(first, second, 8); break;
- case 3:
- first = vec_sld(first, second, 12); break;
- }
-#else
- switch (Offset % 4) {
- case 1:
- first = vec_sld(second, first, 12); break;
- case 2:
- first = vec_sld(second, first, 8); break;
- case 3:
- first = vec_sld(second, first, 4); break;
- }
-#endif
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
- {
-#ifdef _BIG_ENDIAN
- switch (Offset % 4) {
- case 1:
- first = vec_sld(first, second, 4); break;
- case 2:
- first = vec_sld(first, second, 8); break;
- case 3:
- first = vec_sld(first, second, 12); break;
- }
-#else
- switch (Offset % 4) {
- case 1:
- first = vec_sld(second, first, 12); break;
- case 2:
- first = vec_sld(second, first, 8); break;
- case 3:
- first = vec_sld(second, first, 4); break;
- }
-#endif
- }
-};
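Both palign_impl specializations implement the same element shift across two packets; the two vec_sld variants only differ because of byte order. A scalar sketch of the semantics, with a runtime offset standing in for the Offset template parameter (palign_ref is an illustrative name):

    // Reference semantics of palign_impl<Offset>::run: 'first' becomes the last
    // (4 - Offset) elements of 'first' followed by the first Offset elements of 'second'.
    void palign_ref(float first[4], const float second[4], int offset) {
      float tmp[4];
      for (int i = 0; i < 4; ++i)
        tmp[i] = (i + offset < 4) ? first[i + offset] : second[i + offset - 4];
      for (int i = 0; i < 4; ++i)
        first[i] = tmp[i];
    }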
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
- Packet4f t0, t1, t2, t3;
- t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
- t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
- t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
- t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
- kernel.packet[0] = vec_mergeh(t0, t2);
- kernel.packet[1] = vec_mergel(t0, t2);
- kernel.packet[2] = vec_mergeh(t1, t3);
- kernel.packet[3] = vec_mergel(t1, t3);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
- Packet4i t0, t1, t2, t3;
- t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
- t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
- t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
- t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
- kernel.packet[0] = vec_mergeh(t0, t2);
- kernel.packet[1] = vec_mergel(t0, t2);
- kernel.packet[2] = vec_mergeh(t1, t3);
- kernel.packet[3] = vec_mergel(t1, t3);
-}
-
-
-//---------- double ----------
-#if defined(__VSX__)
-typedef __vector double Packet2d;
-typedef __vector unsigned long long Packet2ul;
-typedef __vector long long Packet2l;
-
-static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO;
-static Packet2d p2d_ONE = { 1.0, 1.0 };
-static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO;
-static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
-
-#ifdef _BIG_ENDIAN
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8);
-#else
-static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8);
-#endif
-
-static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index)
-{
- switch (index) {
- case 0:
- return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI);
- case 1:
- return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO);
- }
- return a;
-}
-
-template<> struct packet_traits<double> : default_packet_traits
-{
- typedef Packet2d type;
- typedef Packet2d half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=2,
- HasHalfPacket = 0,
-
- HasDiv = 1,
- HasExp = 1,
- HasSqrt = 0
- };
-};
-
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
-
-
-inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
-{
- union {
- Packet2d v;
- double n[2];
- } vt;
- vt.v = v;
- s << vt.n[0] << ", " << vt.n[1];
- return s;
-}
-
-// Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME
-
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
- double EIGEN_ALIGN16 af[2];
- af[0] = from;
- Packet2d vc = pload<Packet2d>(af);
- vc = vec_splat_dbl(vc, 0);
- return vc;
-}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
- a1 = pload<Packet2d>(a);
- a0 = vec_splat_dbl(a1, 0);
- a1 = vec_splat_dbl(a1, 1);
- a3 = pload<Packet2d>(a+2);
- a2 = vec_splat_dbl(a3, 0);
- a3 = vec_splat_dbl(a3, 1);
-}
-// Google-local: Change type from DenseIndex to int in patch.
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int/*DenseIndex*/ stride)
-{
- double EIGEN_ALIGN16 af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- return pload<Packet2d>(af);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, /*DenseIndex*/int stride)
-{
- double EIGEN_ALIGN16 af[2];
- pstore<double>(af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
-}
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }
-
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
-
-// for some weird reasons, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
-
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
- EIGEN_DEBUG_UNALIGNED_LOAD
- return (Packet2d) vec_vsx_ld((long)from & 15, (const Packet2d*) _EIGEN_ALIGNED_PTR(from));
-}
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{
- Packet2d p;
- if((ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
- else p = ploadu<Packet2d>(from);
- return vec_perm(p, p, p16uc_PSET64_HI);
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
-{
- EIGEN_DEBUG_UNALIGNED_STORE
- vec_vsx_st((Packet4f)from, (long)to & 15, (Packet4f*) _EIGEN_ALIGNED_PTR(to));
-}
-
-#ifndef __VSX__
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); }
-#endif
-
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
- Packet2d b, sum;
- b = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8);
- sum = vec_add(a, b);
- return pfirst(sum);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
- Packet2d v[2], sum;
- v[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8));
- v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8));
-
-#ifdef _BIG_ENDIAN
- sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8);
-#else
- sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8);
-#endif
-
- return sum;
-}
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
- return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
-}
-
-// min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
- return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
-}
-
-// max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
- return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
- {
- if (Offset == 1)
-#ifdef _BIG_ENDIAN
- first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8);
-#else
- first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8);
-#endif
- }
-};
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
- Packet2d t0, t1;
- t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
- t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
- kernel.packet[0] = t0;
- kernel.packet[1] = t1;
-}
-
-#endif // defined(__VSX__)
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_ALTIVEC_H
-
diff --git a/third_party/eigen3/Eigen/src/Core/arch/CUDA/MathFunctions.h b/third_party/eigen3/Eigen/src/Core/arch/CUDA/MathFunctions.h
deleted file mode 100644
index 7e2fb7e699..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
-#define EIGEN_MATH_FUNCTIONS_CUDA_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plog<float4>(const float4& a)
-{
- return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plog<double2>(const double2& a)
-{
- return make_double2(log(a.x), log(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pexp<float4>(const float4& a)
-{
- return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pexp<double2>(const double2& a)
-{
- return make_double2(exp(a.x), exp(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 psqrt<float4>(const float4& a)
-{
- return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 psqrt<double2>(const double2& a)
-{
- return make_double2(sqrt(a.x), sqrt(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 prsqrt<float4>(const float4& a)
-{
- return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 prsqrt<double2>(const double2& a)
-{
- return make_double2(rsqrt(a.x), rsqrt(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plgamma<float4>(const float4& a)
-{
- return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plgamma<double2>(const double2& a)
-{
- return make_double2(lgamma(a.x), lgamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perf<float4>(const float4& a)
-{
- return make_float4(erf(a.x), erf(a.y), erf(a.z), erf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perf<double2>(const double2& a)
-{
- return make_double2(erf(a.x), erf(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perfc<float4>(const float4& a)
-{
- return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perfc<double2>(const double2& a)
-{
- return make_double2(erfc(a.x), erfc(a.y));
-}
-
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/CUDA/PacketMath.h b/third_party/eigen3/Eigen/src/Core/arch/CUDA/PacketMath.h
deleted file mode 100644
index 02aac06ed3..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ /dev/null
@@ -1,342 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_CUDA_H
-#define EIGEN_PACKET_MATH_CUDA_H
-
-namespace Eigen {
-
-namespace internal {
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template<> struct is_arithmetic<float4> { enum { value = true }; };
-template<> struct is_arithmetic<double2> { enum { value = true }; };
-
-
-template<> struct packet_traits<float> : default_packet_traits
-{
- typedef float4 type;
- typedef float4 half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=4,
- HasHalfPacket = 0,
-
- HasDiv = 1,
- HasSin = 0,
- HasCos = 0,
- HasLog = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- HasLGamma = 1,
- HasErf = 1,
- HasErfc = 1,
-
- HasBlend = 0,
- HasSelect = 1,
- HasEq = 1,
- };
-};
-
-template<> struct packet_traits<double> : default_packet_traits
-{
- typedef double2 type;
- typedef double2 half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=2,
- HasHalfPacket = 0,
-
- HasDiv = 1,
- HasLog = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- HasLGamma = 1,
- HasErf = 1,
- HasErfc = 1,
-
- HasBlend = 0,
- HasSelect = 1,
- HasEq = 1,
- };
-};
-
-
-template<> struct unpacket_traits<float4> { typedef float type; enum {size=4}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; };
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
- return make_float4(from, from, from, from);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
- return make_double2(from, from);
-}
-
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) {
- return make_float4(a, a+1, a+2, a+3);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) {
- return make_double2(a, a+1);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
- return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
- return make_double2(a.x+b.x, a.y+b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
- return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
- return make_double2(a.x-b.x, a.y-b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 peq<float4>(const float4& a, const float4& b) {
- return make_float4(a.x == b.x ? 1.f : 0, a.y == b.y ? 1.f : 0, a.z == b.z ? 1.f : 0, a.w == b.w ? 1.f : 0);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 peq<double2>(const double2& a, const double2& b) {
- return make_double2(a.x == b.x ? 1. : 0, a.y == b.y ? 1. : 0);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ple<float4>(const float4& a, const float4& b) {
- return make_float4(a.x <= b.x ? 1.f : 0, a.y <= b.y ? 1.f : 0, a.z <= b.z ? 1.f : 0, a.w <= b.w ? 1.f : 0);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ple<double2>(const double2& a, const double2& b) {
- return make_double2(a.x <= b.x ? 1. : 0, a.y <= b.y ? 1. : 0);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plt<float4>(const float4& a, const float4& b) {
- return make_float4(a.x < b.x ? 1.f : 0, a.y < b.y ? 1.f : 0, a.z < b.z ? 1.f : 0, a.w < b.w ? 1.f : 0);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plt<double2>(const double2& a, const double2& b) {
- return make_double2(a.x < b.x ? 1. : 0, a.y < b.y ? 1. : 0);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pselect<float4>(const float4& a, const float4& b, const float4& c) {
- return make_float4(c.x ? b.x : a.x, c.y ? b.y : a.y, c.z ? b.z : a.z, c.w ? b.w : a.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pselect<double2>(const double2& a, const double2& b, const double2& c) {
- return make_double2(c.x ? b.x : a.x, c.y ? b.y : a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
- return make_float4(-a.x, -a.y, -a.z, -a.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
- return make_double2(-a.x, -a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
- return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
- return make_double2(a.x*b.x, a.y*b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
- return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
- return make_double2(a.x/b.x, a.y/b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
- return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
- return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
- return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
- return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
- return *reinterpret_cast<const float4*>(from);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
- return *reinterpret_cast<const double2*>(from);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
- return make_float4(from[0], from[1], from[2], from[3]);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
- return make_double2(from[0], from[1]);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
- return make_float4(from[0], from[0], from[1], from[1]);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
- return make_double2(from[0], from[0]);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
- *reinterpret_cast<float4*>(to) = from;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
- *reinterpret_cast<double2*>(to) = from;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
- to[0] = from.x;
- to[1] = from.y;
- to[2] = from.z;
- to[3] = from.w;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
- to[0] = from.x;
- to[1] = from.y;
-}
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
- return __ldg((const float4*)from);
-}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
- return __ldg((const double2*)from);
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
- return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
-}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
- return make_double2(__ldg(from+0), __ldg(from+1));
-}
-#endif
-
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
- return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
- return make_double2(from[0*stride], from[1*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
- to[stride*0] = from.x;
- to[stride*1] = from.y;
- to[stride*2] = from.z;
- to[stride*3] = from.w;
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
- to[stride*0] = from.x;
- to[stride*1] = from.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
- return a.x;
-}
-template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
- return a.x;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
- return a.x + a.y + a.z + a.w;
-}
-template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
- return a.x + a.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
- return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
- return fmax(a.x, a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
- return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
- return fmin(a.x, a.y);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
- return a.x * a.y * a.z * a.w;
-}
-template <>
-EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
- return a.x * a.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
- return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
- return make_double2(fabs(a.x), fabs(a.y));
-}
-
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<float4,4>& kernel) {
- double tmp = kernel.packet[0].y;
- kernel.packet[0].y = kernel.packet[1].x;
- kernel.packet[1].x = tmp;
-
- tmp = kernel.packet[0].z;
- kernel.packet[0].z = kernel.packet[2].x;
- kernel.packet[2].x = tmp;
-
- tmp = kernel.packet[0].w;
- kernel.packet[0].w = kernel.packet[3].x;
- kernel.packet[3].x = tmp;
-
- tmp = kernel.packet[1].z;
- kernel.packet[1].z = kernel.packet[2].y;
- kernel.packet[2].y = tmp;
-
- tmp = kernel.packet[1].w;
- kernel.packet[1].w = kernel.packet[3].y;
- kernel.packet[3].y = tmp;
-
- tmp = kernel.packet[2].w;
- kernel.packet[2].w = kernel.packet[3].z;
- kernel.packet[3].z = tmp;
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<double2,2>& kernel) {
- double tmp = kernel.packet[0].y;
- kernel.packet[0].y = kernel.packet[1].x;
- kernel.packet[1].x = tmp;
-}
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-
-#endif // EIGEN_PACKET_MATH_CUDA_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/Default/Settings.h b/third_party/eigen3/Eigen/src/Core/arch/Default/Settings.h
deleted file mode 100644
index 097373c84d..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/Default/Settings.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-/* All the parameters defined in this file can be specialized in the
- * architecture specific files, and/or by the user.
- * More to come... */
-
-#ifndef EIGEN_DEFAULT_SETTINGS_H
-#define EIGEN_DEFAULT_SETTINGS_H
-
-/** Defines the maximal loop size to enable meta unrolling of loops.
- * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
- * it does not correspond to the number of iterations or the number of instructions
- */
-#ifndef EIGEN_UNROLLING_LIMIT
-#define EIGEN_UNROLLING_LIMIT 100
-#endif
-
-/** Defines the threshold between a "small" and a "large" matrix.
- * This threshold is mainly used to select the proper product implementation.
- */
-#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
-#endif
-
-/** Defines the maximal width of the blocks used in the triangular product and solver
- * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
- */
-#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
-#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
-#endif
-
-
-/** Defines the default number of registers available for that architecture.
- * Currently it must be 8 or 16. Other values will fail.
- */
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
-#endif
-
-#endif // EIGEN_DEFAULT_SETTINGS_H
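Each of these settings is guarded by #ifndef, so any of them can be overridden by defining the macro before the first Eigen include. A short example (the values below are arbitrary illustrations, not recommendations):

    // Must appear before any Eigen header is included.
    #define EIGEN_UNROLLING_LIMIT 200                  // allow more aggressive meta unrolling
    #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16   // raise the small/large matrix threshold
    #include <Eigen/Core>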
diff --git a/third_party/eigen3/Eigen/src/Core/arch/NEON/Complex.h b/third_party/eigen3/Eigen/src/Core/arch/NEON/Complex.h
deleted file mode 100644
index 49e3fa1b02..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/NEON/Complex.h
+++ /dev/null
@@ -1,467 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COMPLEX_NEON_H
-#define EIGEN_COMPLEX_NEON_H
-
-namespace Eigen {
-
-namespace internal {
-
-static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
-static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
-
-//---------- float ----------
-struct Packet2cf
-{
- EIGEN_STRONG_INLINE Packet2cf() {}
- EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
- Packet4f v;
-};
-
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
- typedef Packet2cf type;
- typedef Packet2cf half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 2,
- HasHalfPacket = 0,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0
- };
-};
-
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
-
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
- float32x2_t r64;
- r64 = vld1_f32((float *)&from);
-
- return Packet2cf(vcombine_f32(r64, r64));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
- Packet4ui b = vreinterpretq_u32_f32(a.v);
- return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- Packet4f v1, v2;
-
- // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
- v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
- // Get the imaginary values of a | a1_im | a1_im | a2_im | a2_im |
- v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
- // Multiply the real a with b
- v1 = vmulq_f32(v1, b.v);
- // Multiply the imag a with b
- v2 = vmulq_f32(v2, b.v);
- // Conjugate v2
- v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR));
- // Swap real/imag elements in v2.
- v2 = vrev64q_f32(v2);
- // Add and return the result
- return Packet2cf(vaddq_f32(v1, v2));
-}
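Per complex element, the lane duplication, conjugate xor, and rev64 above compute the textbook complex product. A scalar sketch of the same result (cmul_ref is an illustrative name):

    #include <complex>
    // For one element: v1 = (ar*br, ar*bi) and v2 = (ai*br, ai*bi); the
    // conjugate-xor then rev64 turns v2 into (-ai*bi, ai*br), so
    // v1 + v2 = (ar*br - ai*bi, ar*bi + ai*br) = a*b.
    std::complex<float> cmul_ref(std::complex<float> a, std::complex<float> b) {
      float re = a.real() * b.real() - a.imag() * b.imag();
      float im = a.real() * b.imag() + a.imag() * b.real();
      return std::complex<float>(re, im);
    }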
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
-{
- Packet4f res = pset1<Packet4f>(0.f);
- res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
- res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
- res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
- res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
- return Packet2cf(res);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
-{
- to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
- to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
-}
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
- std::complex<float> EIGEN_ALIGN16 x[2];
- vst1q_f32((float *)x, a.v);
- return x[0];
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
- float32x2_t a_lo, a_hi;
- Packet4f a_r128;
-
- a_lo = vget_low_f32(a.v);
- a_hi = vget_high_f32(a.v);
- a_r128 = vcombine_f32(a_hi, a_lo);
-
- return Packet2cf(a_r128);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
-{
- return Packet2cf(vrev64q_f32(a.v));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
- float32x2_t a1, a2;
- std::complex<float> s;
-
- a1 = vget_low_f32(a.v);
- a2 = vget_high_f32(a.v);
- a2 = vadd_f32(a1, a2);
- vst1_f32((float *)&s, a2);
-
- return s;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
- Packet4f sum1, sum2, sum;
-
- // Combine the low halves of vecs[0] and vecs[1], then the high halves, and add:
- // each 64-bit half of the result holds the horizontal sum of one input packet.
- sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
- sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
- sum = vaddq_f32(sum1, sum2);
-
- return Packet2cf(sum);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
- float32x2_t a1, a2, v1, v2, prod;
- std::complex<float> s;
-
- a1 = vget_low_f32(a.v);
- a2 = vget_high_f32(a.v);
- // Duplicate the real part of the first complex: | a1_re | a1_re |
- v1 = vdup_lane_f32(a1, 0);
- // Duplicate the imaginary part of the first complex: | a1_im | a1_im |
- v2 = vdup_lane_f32(a1, 1);
- // Multiply the real part by the second complex (a2)
- v1 = vmul_f32(v1, a2);
- // Multiply the imaginary part by the second complex (a2)
- v2 = vmul_f32(v2, a2);
- // Conjugate v2
- v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR));
- // Swap real/imag elements in v2.
- v2 = vrev64_f32(v2);
- // Add v1, v2
- prod = vadd_f32(v1, v2);
-
- vst1_f32((float *)&s, prod);
-
- return s;
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
- EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
- {
- if (Offset==1)
- {
- first.v = vextq_f32(first.v, second.v, 2);
- }
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- return internal::pmul(a, pconj(b));
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- return internal::pmul(pconj(a), b);
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- return pconj(internal::pmul(a, b));
- }
-};
-
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- // TODO optimize it for NEON
- Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
- Packet4f s, rev_s;
-
- // this computes the norm
- s = vmulq_f32(b.v, b.v);
- rev_s = vrev64q_f32(s);
-
- return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
-}
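The division follows the usual multiply-by-the-conjugate formula; s + rev_s in the packet code is |b|^2 broadcast to both lanes of each element. A scalar sketch of the same computation (cdiv_ref is an illustrative name):

    #include <complex>
    // a/b = a*conj(b) / |b|^2 ; the conj_helper<...,false,true> call computes a*conj(b).
    std::complex<float> cdiv_ref(std::complex<float> a, std::complex<float> b) {
      float norm = b.real() * b.real() + b.imag() * b.imag();
      std::complex<float> num = a * std::conj(b);
      return std::complex<float>(num.real() / norm, num.imag() / norm);
    }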
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cf,2>& kernel) {
- Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
- kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
- kernel.packet[1].v = tmp;
-}
-
-//---------- double ----------
-#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-
-static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
-
-struct Packet1cd
-{
- EIGEN_STRONG_INLINE Packet1cd() {}
- EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
- Packet2d v;
-};
-
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
- typedef Packet1cd type;
- typedef Packet1cd half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 0,
- size = 1,
- HasHalfPacket = 0,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0
- };
-};
-
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
-
-template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- Packet2d v1, v2;
-
- // Get the real values of a
- v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
- // Get the imaginary values of a
- v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
- // Multiply the real a with b
- v1 = vmulq_f64(v1, b.v);
- // Multiply the imag a with b
- v2 = vmulq_f64(v2, b.v);
- // Conjugate v2
- v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
- // Swap real/imag elements in v2.
- v2 = preverse<Packet2d>(v2);
- // Add and return the result
- return Packet1cd(vaddq_f64(v1, v2));
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, int stride)
-{
- Packet2d res = pset1<Packet2d>(0.0);
- res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
- res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
- return Packet1cd(res);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, int stride)
-{
- to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
-}
-
-
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
- std::complex<double> EIGEN_ALIGN16 res;
- pstore<std::complex<double> >(&res, a);
-
- return res;
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
-
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
- static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
- {
- // FIXME is it certain that we never have to align a Packet1cd?
- // Even though a std::complex<double> is 16 bytes, it is not necessarily aligned on a 16-byte boundary...
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- return internal::pmul(a, pconj(b));
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- return internal::pmul(pconj(a), b);
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- return pconj(internal::pmul(a, b));
- }
-};
-
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- // TODO optimize it for NEON
- Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
- Packet2d s = pmul<Packet2d>(b.v, b.v);
- Packet2d rev_s = preverse<Packet2d>(s);
-
- return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s)));
-}
-
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
- return Packet1cd(preverse(Packet2d(x.v)));
-}
-
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
- Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
- kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
- kernel.packet[1].v = tmp;
-}
-#endif // EIGEN_ARCH_ARM64
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_NEON_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h b/third_party/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h
deleted file mode 100644
index 6bb05bb922..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ /dev/null
@@ -1,91 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
-#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
-#define EIGEN_MATH_FUNCTIONS_NEON_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- Packet4f tmp, fx;
-
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
- x = vminq_f32(x, p4f_exp_hi);
- x = vmaxq_f32(x, p4f_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
-
- /* perform a floorf */
- tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
-
- /* if greater, subtract 1 */
- Packet4ui mask = vcgtq_f32(tmp, fx);
- mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
-
- fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
-
- tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
- Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
- x = vsubq_f32(x, tmp);
- x = vsubq_f32(x, z);
-
- Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
- z = vmulq_f32(x, x);
- y = vaddq_f32(y, p4f_cephes_exp_p1);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p2);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p3);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p4);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p5);
-
- y = vmulq_f32(y, z);
- y = vaddq_f32(y, x);
- y = vaddq_f32(y, p4f_1);
-
- /* build 2^n */
- int32x4_t mm;
- mm = vcvtq_s32_f32(fx);
- mm = vaddq_s32(mm, p4i_0x7f);
- mm = vshlq_n_s32(mm, 23);
- Packet4f pow2n = vreinterpretq_f32_s32(mm);
-
- y = vmulq_f32(y, pow2n);
- return y;
-}
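A scalar sketch of the range reduction and polynomial evaluated above; the clamp to [exp_lo, exp_hi] is omitted and pexp_ref is an illustrative name, but the constants are the same Cephes coefficients:

    #include <cmath>
    // exp(x) = 2^n * exp(g), with n = round(x / ln 2) and g = x - n*ln 2,
    // where ln 2 is split into C1 + C2 for extra precision and exp(g) is
    // approximated by a degree-6 polynomial on the reduced interval.
    float pexp_ref(float x) {
      float n = std::floor(x * 1.44269504088896341f + 0.5f);  // round(x / ln 2)
      float g = x - n * 0.693359375f;                         // subtract n * C1
      g -= n * -2.12194440e-4f;                               // subtract n * C2
      float p = 1.9875691500e-4f;
      p = p * g + 1.3981999507e-3f;
      p = p * g + 8.3334519073e-3f;
      p = p * g + 4.1665795894e-2f;
      p = p * g + 1.6666665459e-1f;
      p = p * g + 5.0000001201e-1f;
      float y = p * g * g + g + 1.0f;
      return std::ldexp(y, static_cast<int>(n));              // multiply by 2^n
    }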
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_NEON_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h b/third_party/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h
deleted file mode 100644
index 856a65ad7b..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h
+++ /dev/null
@@ -1,745 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
-// Heavily based on Gael's SSE version.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_NEON_H
-#define EIGEN_PACKET_MATH_NEON_H
-
-namespace Eigen {
-
-namespace internal {
-
-#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
-#endif
-
-// FIXME NEON has 16 quad registers, but since the current register allocator
-// is so bad, it is much better to reduce it to 8
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
-#endif
-
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#endif
-
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
-typedef float32x2_t Packet2f;
-typedef float32x4_t Packet4f;
-typedef int32x4_t Packet4i;
-typedef int32x2_t Packet2i;
-typedef uint32x4_t Packet4ui;
-
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- const Packet4f p4f_##NAME = pset1<Packet4f>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
- const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
-
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- const Packet4i p4i_##NAME = pset1<Packet4i>(X)
-
-#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
- // Special treatment for Apple's llvm-gcc: its NEON packet types are unions
- #define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}}
- #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
-#else
- //Default initializer for packets
- #define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y}
- #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
-#endif
-
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
- // which is available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
- #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
-#elif defined __pld
- #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
- #define EIGEN_ARM_PREFETCH(ADDR) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
-#else
- // by default no explicit prefetching
- #define EIGEN_ARM_PREFETCH(ADDR)
-#endif
-
-template<> struct packet_traits<float> : default_packet_traits
-{
- typedef Packet4f type;
- typedef Packet4f half; // Packet2f intrinsics not implemented yet
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 4,
- HasHalfPacket=0, // Packet2f intrinsics not implemented yet
-
- HasDiv = 1,
- // FIXME check the Has*
- HasSin = 0,
- HasCos = 0,
- HasTanH = 1,
- HasLog = 0,
- HasExp = 1,
- HasSqrt = 0
- };
-};
-template<> struct packet_traits<int> : default_packet_traits
-{
- typedef Packet4i type;
- typedef Packet4i half; // Packet2i intrinsics not implemented yet
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=4,
- HasHalfPacket=0 // Packet2i intrinsics not implemented yet
- // FIXME check the Has*
- };
-};
-
-#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
-// workaround gcc 4.2, 4.3 and 4.4 compilation issue
-EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
-EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
-#endif
-
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
-
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
-
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
-{
- Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
- return vaddq_f32(pset1<Packet4f>(a), countdown);
-}
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
-{
- Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
- return vaddq_s32(pset1<Packet4i>(a), countdown);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pselect<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& false_mask) {
- return vbslq_f32(vreinterpretq_u32_f32(false_mask), b, a);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pselect<Packet4i>(const Packet4i& a, const Packet4i& b, const Packet4i& false_mask) {
- return vbslq_s32(vreinterpretq_u32_s32(false_mask), b, a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-#if EIGEN_ARCH_ARM64
- return vdivq_f32(a,b);
-#else
- Packet4f inv, restep, div;
-
- // NEON does not offer a divide instruction; we have to use a reciprocal approximation.
- // However, unlike other SIMD engines (AltiVec/SSE), NEON offers both a reciprocal
- // estimate AND a reciprocal step, which saves a few instructions.
- // vrecpeq_f32() returns an estimate of 1/b, which we then refine with
- // Newton-Raphson using vrecpsq_f32()
- inv = vrecpeq_f32(b);
-
- // This returns a differential, by which we will have to multiply inv to get a better
- // approximation of 1/b.
- restep = vrecpsq_f32(b, inv);
- inv = vmulq_f32(restep, inv);
-
- // Finally, multiply a by 1/b and get the wanted result of the division.
- div = vmulq_f32(a, inv);
-
- return div;
-#endif
-}
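In the non-ARM64 branch above, vrecpeq_f32 provides a coarse estimate r0 of 1/b and vrecpsq_f32(b, r) computes 2 - b*r, so the following multiply performs one Newton-Raphson step r1 = r0*(2 - b*r0). A scalar sketch of that refinement (the recip_estimate argument is a hypothetical stand-in for vrecpeq_f32):

// Scalar illustration of the reciprocal-estimate + Newton-Raphson division above.
float divide_by_reciprocal(float a, float b, float recip_estimate) {
  float r = recip_estimate;       // r ~= 1/b, low precision (vrecpeq_f32)
  r = r * (2.0f - b * r);         // one Newton-Raphson step (vrecpsq_f32 followed by vmulq_f32)
  return a * r;                   // a/b ~= a * (1/b)
}

Each step roughly doubles the number of correct bits, which is why the code settles for a single step after the coarse estimate.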
-
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division is not supported by NEON");
- return pset1<Packet4i>(0);
-}
-
-#ifdef __ARM_FEATURE_FMA
-// See bug 936.
-// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
-// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
-// MLA is not fused i.e. does 2 roundings.
-// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
-// MLA: 10 GFlop/s ; FMA: 12 GFlop/s.
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
-#endif
-
-// No FMA instruction for int, so use MLA unconditionally.
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
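The comment above boils down to one rounding versus two. A minimal scalar illustration (note that a compiler may contract the unfused form back into an FMA unless floating-point contraction is disabled):

#include <cmath>

// Single rounding: round(a*b + c), as vfmaq_f32 does.
float fused_madd(float a, float b, float c)   { return std::fma(a, b, c); }
// Two roundings: round(round(a*b) + c), as vmlaq_f32 does.
float unfused_madd(float a, float b, float c) { return a * b + c; }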
-
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
-
-// TODO(ebrevdo): add support for ple, plt, peq using vcle_f32/s32 or
-// vcleq_f32/s32, and their ilk, respectively, once it's clear which condition code to use.
-
-// Logical operations are not supported for float, so we have to use reinterpret casts via NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
-}
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
-}
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
-}
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
-}
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
-
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
-
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
- float32x2_t lo, hi;
- lo = vld1_dup_f32(from);
- hi = vld1_dup_f32(from+1);
- return vcombine_f32(lo, hi);
-}
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
-{
- int32x2_t lo, hi;
- lo = vld1_dup_s32(from);
- hi = vld1_dup_s32(from+1);
- return vcombine_s32(lo, hi);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
-{
- Packet4f res = pset1<Packet4f>(0);
- res = vsetq_lane_f32(from[0*stride], res, 0);
- res = vsetq_lane_f32(from[1*stride], res, 1);
- res = vsetq_lane_f32(from[2*stride], res, 2);
- res = vsetq_lane_f32(from[3*stride], res, 3);
- return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
-{
- Packet4i res = pset1<Packet4i>(0);
- res = vsetq_lane_s32(from[0*stride], res, 0);
- res = vsetq_lane_s32(from[1*stride], res, 1);
- res = vsetq_lane_s32(from[2*stride], res, 2);
- res = vsetq_lane_s32(from[3*stride], res, 3);
- return res;
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
-{
- to[stride*0] = vgetq_lane_f32(from, 0);
- to[stride*1] = vgetq_lane_f32(from, 1);
- to[stride*2] = vgetq_lane_f32(from, 2);
- to[stride*3] = vgetq_lane_f32(from, 3);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
-{
- to[stride*0] = vgetq_lane_s32(from, 0);
- to[stride*1] = vgetq_lane_s32(from, 1);
- to[stride*2] = vgetq_lane_s32(from, 2);
- to[stride*3] = vgetq_lane_s32(from, 3);
-}
-
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); }
-
-// FIXME only store the first 2 elements?
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
- float32x2_t a_lo, a_hi;
- Packet4f a_r64;
-
- a_r64 = vrev64q_f32(a);
- a_lo = vget_low_f32(a_r64);
- a_hi = vget_high_f32(a_r64);
- return vcombine_f32(a_hi, a_lo);
-}
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
- int32x2_t a_lo, a_hi;
- Packet4i a_r64;
-
- a_r64 = vrev64q_s32(a);
- a_lo = vget_low_s32(a_r64);
- a_hi = vget_high_s32(a_r64);
- return vcombine_s32(a_hi, a_lo);
-}
-
-template<size_t offset>
-struct protate_impl<offset, Packet4f>
-{
- static Packet4f run(const Packet4f& a) {
- return vextq_f32(a, a, offset);
- }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet4i>
-{
- static Packet4i run(const Packet4i& a) {
- return vextq_s32(a, a, offset);
- }
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
-
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- float32x2_t a_lo, a_hi, sum;
-
- a_lo = vget_low_f32(a);
- a_hi = vget_high_f32(a);
- sum = vpadd_f32(a_lo, a_hi);
- sum = vpadd_f32(sum, sum);
- return vget_lane_f32(sum, 0);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
- float32x4x2_t vtrn1, vtrn2, res1, res2;
- Packet4f sum1, sum2, sum;
-
- // NEON zip performs interleaving of the supplied vectors.
- // We perform two interleaves in a row to acquire the transposed vector
- vtrn1 = vzipq_f32(vecs[0], vecs[2]);
- vtrn2 = vzipq_f32(vecs[1], vecs[3]);
- res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
- res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
-
- // Do the addition of the resulting vectors
- sum1 = vaddq_f32(res1.val[0], res1.val[1]);
- sum2 = vaddq_f32(res2.val[0], res2.val[1]);
- sum = vaddq_f32(sum1, sum2);
-
- return sum;
-}
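The zip-based preduxp above amounts to transposing the 4x4 block formed by the four packets and then summing the transposed rows, so lane i of the result is the horizontal sum of vecs[i]. A scalar sketch of that contract:

// Scalar equivalent of the zip-transpose-then-add reduction above.
void preduxp_scalar(const float vecs[4][4], float out[4]) {
  float t[4][4];
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      t[c][r] = vecs[r][c];                            // the two vzipq_f32 rounds realize this transpose
  for (int i = 0; i < 4; ++i)
    out[i] = t[0][i] + t[1][i] + t[2][i] + t[3][i];    // vertical adds of the transposed rows
}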
-
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
- int32x2_t a_lo, a_hi, sum;
-
- a_lo = vget_low_s32(a);
- a_hi = vget_high_s32(a);
- sum = vpadd_s32(a_lo, a_hi);
- sum = vpadd_s32(sum, sum);
- return vget_lane_s32(sum, 0);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
- int32x4x2_t vtrn1, vtrn2, res1, res2;
- Packet4i sum1, sum2, sum;
-
- // NEON zip performs interleaving of the supplied vectors.
- // We perform two interleaves in a row to acquire the transposed vector
- vtrn1 = vzipq_s32(vecs[0], vecs[2]);
- vtrn2 = vzipq_s32(vecs[1], vecs[3]);
- res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
- res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
-
- // Do the addition of the resulting vectors
- sum1 = vaddq_s32(res1.val[0], res1.val[1]);
- sum2 = vaddq_s32(res2.val[0], res2.val[1]);
- sum = vaddq_s32(sum1, sum2);
-
- return sum;
-}
-
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
- float32x2_t a_lo, a_hi, prod;
-
- // Get a_lo = |a1|a2| and a_hi = |a3|a4|
- a_lo = vget_low_f32(a);
- a_hi = vget_high_f32(a);
- // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
- prod = vmul_f32(a_lo, a_hi);
- // Multiply prod with its swapped value |a2*a4|a1*a3|
- prod = vmul_f32(prod, vrev64_f32(prod));
-
- return vget_lane_f32(prod, 0);
-}
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
- int32x2_t a_lo, a_hi, prod;
-
- // Get a_lo = |a1|a2| and a_hi = |a3|a4|
- a_lo = vget_low_s32(a);
- a_hi = vget_high_s32(a);
- // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
- prod = vmul_s32(a_lo, a_hi);
- // Multiply prod with its swapped value |a2*a4|a1*a3|
- prod = vmul_s32(prod, vrev64_s32(prod));
-
- return vget_lane_s32(prod, 0);
-}
-
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
- float32x2_t a_lo, a_hi, min;
-
- a_lo = vget_low_f32(a);
- a_hi = vget_high_f32(a);
- min = vpmin_f32(a_lo, a_hi);
- min = vpmin_f32(min, min);
-
- return vget_lane_f32(min, 0);
-}
-
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
- int32x2_t a_lo, a_hi, min;
-
- a_lo = vget_low_s32(a);
- a_hi = vget_high_s32(a);
- min = vpmin_s32(a_lo, a_hi);
- min = vpmin_s32(min, min);
-
- return vget_lane_s32(min, 0);
-}
-
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
- float32x2_t a_lo, a_hi, max;
-
- a_lo = vget_low_f32(a);
- a_hi = vget_high_f32(a);
- max = vpmax_f32(a_lo, a_hi);
- max = vpmax_f32(max, max);
-
- return vget_lane_f32(max, 0);
-}
-
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
- int32x2_t a_lo, a_hi, max;
-
- a_lo = vget_low_s32(a);
- a_hi = vget_high_s32(a);
- max = vpmax_s32(a_lo, a_hi);
- max = vpmax_s32(max, max);
-
- return vget_lane_s32(max, 0);
-}
-
-// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing spurious compilation errors,
-// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
-#define PALIGN_NEON(Offset,Type,Command) \
-template<>\
-struct palign_impl<Offset,Type>\
-{\
- EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
- {\
- if (Offset!=0)\
- first = Command(first, second, Offset);\
- }\
-};\
-
-PALIGN_NEON(0,Packet4f,vextq_f32)
-PALIGN_NEON(1,Packet4f,vextq_f32)
-PALIGN_NEON(2,Packet4f,vextq_f32)
-PALIGN_NEON(3,Packet4f,vextq_f32)
-PALIGN_NEON(0,Packet4i,vextq_s32)
-PALIGN_NEON(1,Packet4i,vextq_s32)
-PALIGN_NEON(2,Packet4i,vextq_s32)
-PALIGN_NEON(3,Packet4i,vextq_s32)
-
-#undef PALIGN_NEON
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
- float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
- float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
-
- kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
- kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
- kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
- kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
- int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
- int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
- kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
- kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
- kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
- kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
-}
-
-//---------- double ----------
-
-// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
-// Confirmed at least with __apple_build_version__ = 6000054.
-#ifdef __apple_build_version__
-// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
-// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
-// major toolchain updates.
-#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
-#else
-#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
-#endif
-
-#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-
-#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
-// Bug 907: workaround missing declarations of the following two functions in the ADK
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_f64 (float64x2_t __a)
-{
- return (uint64x2_t) __a;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_f64_u64 (uint64x2_t __a)
-{
- return (float64x2_t) __a;
-}
-#endif
-
-typedef float64x2_t Packet2d;
-typedef float64x1_t Packet1d;
-
-template<> struct packet_traits<double> : default_packet_traits
-{
- typedef Packet2d type;
- typedef Packet2d half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 2,
- HasHalfPacket=0,
-
- HasDiv = 1,
- // FIXME check the Has*
- HasSin = 0,
- HasCos = 0,
- HasLog = 0,
- HasExp = 0,
- HasSqrt = 0
- };
-};
-
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
-
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a)
-{
- Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
- return vaddq_f64(pset1<Packet2d>(a), countdown);
-}
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet2d pselect<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& false_mask) {
- return vbslq_f64(vreinterpretq_u64_f64(false_mask), b, a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
-
-#ifdef __ARM_FEATURE_FMA
-// See bug 936. See above comment about FMA for float.
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
-#else
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
-
-// Logical operations are not supported for double, so we have to use reinterpret casts via NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
- return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
- return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
- return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
- return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{
- return vld1q_dup_f64(from);
-}
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
-{
- Packet2d res = pset1<Packet2d>(0.0);
- res = vsetq_lane_f64(from[0*stride], res, 0);
- res = vsetq_lane_f64(from[1*stride], res, 1);
- return res;
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride)
-{
- to[stride*0] = vgetq_lane_f64(from, 0);
- to[stride*1] = vgetq_lane_f64(from, 1);
-}
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
-
-// FIXME only store the first 2 elements?
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
-
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
-
-template<size_t offset>
-struct protate_impl<offset, Packet2d>
-{
- static Packet2d run(const Packet2d& a) {
- return vextq_f64(a, a, offset);
- }
-};
-
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
-
-#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
-// workaround ICE, see bug 907
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
-#else
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
- float64x2_t trn1, trn2;
-
- // NEON zip performs interleaving of the supplied vectors.
- // We perform two interleaves in a row to acquire the transposed vector
- trn1 = vzip1q_f64(vecs[0], vecs[1]);
- trn2 = vzip2q_f64(vecs[0], vecs[1]);
-
- // Do the addition of the resulting vectors
- return vaddq_f64(trn1, trn2);
-}
-// Other reduction functions:
-// mul
-#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
-#else
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
-#endif
-
-// min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
-
-// max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
-
-// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing spurious compilation errors,
-// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
-#define PALIGN_NEON(Offset,Type,Command) \
-template<>\
-struct palign_impl<Offset,Type>\
-{\
- EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
- {\
- if (Offset!=0)\
- first = Command(first, second, Offset);\
- }\
-};\
-
-PALIGN_NEON(0,Packet2d,vextq_f64)
-PALIGN_NEON(1,Packet2d,vextq_f64)
-#undef PALIGN_NEON
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
- float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
- float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
-
- kernel.packet[0] = trn1;
- kernel.packet[1] = trn2;
-}
-#endif // EIGEN_ARCH_ARM64
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_NEON_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h
deleted file mode 100644
index 2722893dcf..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/SSE/Complex.h
+++ /dev/null
@@ -1,486 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COMPLEX_SSE_H
-#define EIGEN_COMPLEX_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-//---------- float ----------
-struct Packet2cf
-{
- EIGEN_STRONG_INLINE Packet2cf() {}
- EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
- __m128 v;
-};
-
-// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
-// to leverage AVX instructions.
-#ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
- typedef Packet2cf type;
- typedef Packet2cf half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size = 2,
- HasHalfPacket = 0,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0,
- HasBlend = 1,
- };
-};
-#endif
-
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
-{
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
- return Packet2cf(_mm_xor_ps(a.v,mask));
-}
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_xor_ps(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- // TODO optimize it for SSE3 and 4
- #ifdef EIGEN_VECTORIZE_SSE3
- return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
- _mm_mul_ps(_mm_movehdup_ps(a.v),
- vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-// _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-// vec4f_swizzle1(b.v, 1, 0, 3, 2))));
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
- return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
- _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
- #endif
-}
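The multiply above implements the textbook complex product (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br): moveldup/movehdup broadcast the real and imaginary parts of a, the swizzle swaps br and bi, and addsub applies the minus/plus sign pattern across the lanes. As a scalar sketch:

#include <complex>

// Scalar form of the packet complex multiply above (two such products fit in one __m128).
std::complex<float> cmul_sketch(std::complex<float> a, std::complex<float> b) {
  float ar = a.real(), ai = a.imag();
  float br = b.real(), bi = b.imag();
  return {ar * br - ai * bi,    // addsub subtracts in the even (real) lanes
          ar * bi + ai * br};   // and adds in the odd (imaginary) lanes
}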
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
- Packet2cf res;
-#if EIGEN_GNUC_AT_MOST(4,2)
- // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
- res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
-#elif EIGEN_GNUC_AT_LEAST(4,6)
- // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wuninitialized"
- res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
- #pragma GCC diagnostic pop
-#else
- res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
-#endif
- return Packet2cf(_mm_movelh_ps(res.v,res.v));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
-
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
-{
- return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
- std::imag(from[0*stride]), std::real(from[0*stride])));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
-{
- to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
- to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
-}
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
- #if EIGEN_GNUC_AT_MOST(4,3)
- // Workaround gcc 4.2 ICE - this is not performance-wise ideal, but who cares...
- // This workaround also fixes invalid code generation with gcc 4.3
- EIGEN_ALIGN16 std::complex<float> res[2];
- _mm_store_ps((float*)res, a.v);
- return res[0];
- #else
- std::complex<float> res;
- _mm_storel_pi((__m64*)&res, a.v);
- return res;
- #endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
- return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
- return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
- return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
- static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
- {
- if (Offset==1)
- {
- first.v = _mm_movehl_ps(first.v, first.v);
- first.v = _mm_movelh_ps(first.v, second.v);
- }
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return internal::pmul(a, pconj(b));
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
- _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2))));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return internal::pmul(pconj(a), b);
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
- _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return pconj(internal::pmul(a, b));
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
- _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2))));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
- { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
- EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
- { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
-};
-
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- // TODO optimize it for SSE3 and 4
- Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
- __m128 s = _mm_mul_ps(b.v,b.v);
- return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
-}
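The division above uses the identity a/b = a*conj(b) / |b|^2: conj_helper<Packet2cf,Packet2cf,false,true> supplies the numerator a*conj(b), and adding s = b*b to its lane-swapped copy puts br^2 + bi^2 in both lanes of the denominator. As a scalar sketch:

#include <complex>

// Scalar form of the packet complex division above: a/b = a*conj(b) / |b|^2.
std::complex<float> cdiv_sketch(std::complex<float> a, std::complex<float> b) {
  float norm = b.real() * b.real() + b.imag() * b.imag();  // s plus its swizzled copy
  std::complex<float> num = a * std::conj(b);              // conj_helper<..., false, true>::pmul
  return {num.real() / norm, num.imag() / norm};
}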
-
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
-{
- return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
-}
-
-
-//---------- double ----------
-struct Packet1cd
-{
- EIGEN_STRONG_INLINE Packet1cd() {}
- EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
- __m128d v;
-};
-
-// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
-// to leverage AVX instructions.
-#ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
- typedef Packet1cd type;
- typedef Packet1cd half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 0,
- size = 1,
- HasHalfPacket = 0,
-
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSetLinear = 0
- };
-};
-#endif
-
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
-
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
-{
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_xor_pd(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- // TODO optimize it for SSE3 and 4
- #ifdef EIGEN_VECTORIZE_SSE3
- return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
- _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0))));
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
- return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
- _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0)), mask)));
- #endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
-
-// FIXME force unaligned load, this is a temporary fix
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
-
-// FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
- EIGEN_ALIGN16 double res[2];
- _mm_store_pd(res, a.v);
- return std::complex<double>(res[0],res[1]);
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
- return vecs[0];
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
- return pfirst(a);
-}
-
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
- static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
- {
- // FIXME is it certain that we never have to align a Packet1cd?
- // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16-byte boundary...
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return internal::pmul(a, pconj(b));
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
- _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0))));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return internal::pmul(pconj(a), b);
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
- _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0)), mask)));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(pmul(x,y),c); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
- {
- #ifdef EIGEN_VECTORIZE_SSE3
- return pconj(internal::pmul(a, b));
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
- _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0))));
- #endif
- }
-};
-
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
- { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
-};
-
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
- EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
- { return padd(c, pmul(x,y)); }
-
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
- { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
-};
-
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- // TODO optimize it for SSE3 and 4
- Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
- __m128d s = _mm_mul_pd(b.v,b.v);
- return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
-}
-
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
- return Packet1cd(preverse(Packet2d(x.v)));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cf,2>& kernel) {
- __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
- __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
-
- __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
- kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
- kernel.packet[1].v = tmp;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
- __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
- return Packet2cf(_mm_castpd_ps(result));
-}
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_SSE_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h
deleted file mode 100644
index 0baa7b4b58..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ /dev/null
@@ -1,529 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2007 Julien Pommier
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
-#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
-#define EIGEN_MATH_FUNCTIONS_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
- /* the smallest non-denormalized float number */
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
-
- /* natural logarithm computed for 4 simultaneous floats;
- returns NaN for x <= 0
- */
- _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-
- Packet4i emm0;
-
- // invalid_mask is set to true when x is negative or NaN
- Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps());
- Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-
- x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
- emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
-
- /* keep only the fractional part */
- x = _mm_and_ps(x, p4f_inv_mant_mask);
- x = _mm_or_ps(x, p4f_half);
-
- emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
- Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
- Packet4f tmp = pand(x, mask);
- x = psub(x, p4f_1);
- e = psub(e, pand(p4f_1, mask));
- x = padd(x, tmp);
-
- Packet4f x2 = pmul(x,x);
- Packet4f x3 = pmul(x2,x);
-
- Packet4f y, y1, y2;
- y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
- y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
- y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
- y = pmadd(y , x, p4f_cephes_log_p2);
- y1 = pmadd(y1, x, p4f_cephes_log_p5);
- y2 = pmadd(y2, x, p4f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- y1 = pmul(e, p4f_cephes_log_q1);
- tmp = pmul(x2, p4f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p4f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
- // negative arg will be NAN, 0 will be -INF
- return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
- _mm_and_ps(iszero_mask, p4f_minus_inf));
-}
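plog above factors x as m*2^e by slicing out the exponent bits, folds m into roughly [sqrt(1/2), sqrt(2)) via the SQRTHF test, evaluates a degree-8 polynomial in m - 1, and re-adds e*ln(2) through the split constants q1/q2. A rough scalar sketch of the range reduction, using std::frexp in place of the bit manipulation (polynomial evaluation omitted):

#include <cmath>
#include <limits>

// Scalar sketch of the range reduction in plog above: x = m * 2^e with
// m folded into [sqrt(0.5), sqrt(2)), so log(x) = e*ln(2) + log(m) and m - 1 is small.
float log_range_reduce_sketch(float x, float* mantissa_minus_1) {
  if (x <= 0.0f) {
    *mantissa_minus_1 = 0.0f;
    return (x == 0.0f) ? -std::numeric_limits<float>::infinity()    // iszero_mask path
                       :  std::numeric_limits<float>::quiet_NaN();  // invalid_mask path
  }
  int e;
  float m = std::frexp(x, &e);       // x = m * 2^e with m in [0.5, 1)
  if (m < 0.707106781186547524f) {   // cephes_SQRTHF branch
    m = m + m;                       // double m ...
    e -= 1;                          // ... and compensate in the exponent
  }
  *mantissa_minus_1 = m - 1.0f;      // the log polynomial above is evaluated in this variable
  return static_cast<float>(e) * 0.693147180559945f;  // e * ln(2); q1/q2 split this constant
}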
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-
- _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
- Packet4f tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_ps(fx);
-#else
- emm0 = _mm_cvttps_epi32(fx);
- tmp = _mm_cvtepi32_ps(emm0);
- /* if greater, subtract 1 */
- Packet4f mask = _mm_cmpgt_ps(tmp, fx);
- mask = _mm_and_ps(mask, p4f_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p4f_cephes_exp_C1);
- Packet4f z = pmul(fx, p4f_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- z = pmul(x,x);
-
- Packet4f y = p4f_cephes_exp_p0;
- y = pmadd(y, x, p4f_cephes_exp_p1);
- y = pmadd(y, x, p4f_cephes_exp_p2);
- y = pmadd(y, x, p4f_cephes_exp_p3);
- y = pmadd(y, x, p4f_cephes_exp_p4);
- y = pmadd(y, x, p4f_cephes_exp_p5);
- y = pmadd(y, z, x);
- y = padd(y, p4f_1);
-
- // build 2^n
- emm0 = _mm_cvttps_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_0x7f);
- emm0 = _mm_slli_epi32(emm0, 23);
- return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
-}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
-{
- Packet2d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
- _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
- _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
- static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
- Packet2d tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_pd(fx);
-#else
- emm0 = _mm_cvttpd_epi32(fx);
- tmp = _mm_cvtepi32_pd(emm0);
- /* if greater, subtract 1 */
- Packet2d mask = _mm_cmpgt_pd(tmp, fx);
- mask = _mm_and_pd(mask, p2d_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p2d_cephes_exp_C1);
- Packet2d z = pmul(fx, p2d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet2d x2 = pmul(x,x);
-
- Packet2d px = p2d_cephes_exp_p0;
- px = pmadd(px, x2, p2d_cephes_exp_p1);
- px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
-
- Packet2d qx = p2d_cephes_exp_q0;
- qx = pmadd(qx, x2, p2d_cephes_exp_q1);
- qx = pmadd(qx, x2, p2d_cephes_exp_q2);
- qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
-
- // build 2^n
- emm0 = _mm_cvttpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023_0);
- emm0 = _mm_slli_epi32(emm0, 20);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
- return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
-}
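The double-precision version above swaps the float kernel's plain polynomial for the Cephes rational form: after the same n*ln(2) reduction, exp(g) is approximated as 1 + 2*g*P(g^2) / (Q(g^2) - g*P(g^2)), which is what the px/qx block computes before the 2^n scaling. A scalar sketch of that rational core, using the cephes_exp_p*/q* coefficients declared above:

// Scalar sketch of the rational core in the double pexp above; g is the reduced argument.
double exp_rational_core_sketch(double g) {
  double g2 = g * g;
  double P = (1.26177193074810590878e-4 * g2 + 3.02994407707441961300e-2) * g2
             + 9.99999999999999999910e-1;
  double Q = ((3.00198505138664455042e-6 * g2 + 2.52448340349684104192e-3) * g2
              + 2.27265548208155028766e-1) * g2 + 2.00000000000000000009e0;
  double pg = P * g;                     // px = pmul(px, x) above
  return 2.0 * pg / (Q - pg) + 1.0;      // x = pdiv(px, psub(qx, px)); x = pmadd(p2d_2, x, p2d_1)
}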
-
-/* evaluation of 4 sines at once, using SSE2 intrinsics.
-
- The code is the exact rewriting of the cephes sinf function.
- Precision is excellent as long as x < 8192 (I did not bother to
- take into account the special handling they have for greater values
- -- it does not return garbage for arguments over 8192, though, but
- the extra precision is missing).
-
- Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
- surprising but correct result.
-*/
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psin<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
- Packet4i emm0, emm2;
- sign_bit = x;
- /* take the absolute value */
- x = pabs(x);
-
- /* take the modulo */
-
- /* extract the sign bit (upper one) */
- sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* store the integer part of y in emm2 */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2);
- /* get the swap sign flag */
- emm0 = _mm_and_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynomial selection mask:
- there is one polynomial for 0 <= x <= Pi/4
- and another one for Pi/4 < x <= Pi/2.
-
- Both branches will be computed.
- */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
- sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynomial (0 <= x <= Pi/4) */
- y = p4f_coscof_p0;
- Packet4f z = _mm_mul_ps(x,x);
-
- y = pmadd(y, z, p4f_coscof_p1);
- y = pmadd(y, z, p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = pmul(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1);
-
- /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
-
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmul(y2, x);
- y2 = padd(y2, x);
-
- /* select the correct result from the two polynomials */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
-}
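For illustration, a scalar transcription of the reduction psin performs above: bring the argument into [-Pi/4, Pi/4], then pick between the sine and cosine polynomials according to the quadrant. Not Eigen code; it assumes <cmath>/<cstdint> and moderate |x| (no handling of very large arguments).

#include <cmath>
#include <cstdint>

static float sin_sketch(float x) {
  float sign = (x < 0.0f) ? -1.0f : 1.0f;
  x = std::fabs(x);
  int32_t j = static_cast<int32_t>(x * 1.27323954473516f);   // x * 4/Pi, truncated
  j = (j + 1) & ~1;                                           // j = (j+1) & ~1, as in cephes
  float y = static_cast<float>(j);
  if (j & 4) sign = -sign;                                    // quadrant-dependent sign swap
  // extended precision modular arithmetic: subtract y*Pi/4 in three pieces
  float r = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f)
            - y * 3.77489497744594108e-8f;
  float z = r * r;
  // both polynomials are evaluated, exactly as in the vectorized code
  float s = ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z - 1.6666654611e-1f) * z * r + r;
  float c = ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z + 4.166664568298827e-2f)
            * z * z - 0.5f * z + 1.0f;
  return sign * ((j & 2) ? c : s);                            // polynomial selection
}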
-
-/* almost the same as psin */
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pcos<Packet4f>(const Packet4f& _x)
-{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, y;
- Packet4i emm0, emm2;
-
- x = pabs(x);
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* get the integer part of y */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2);
-
- emm2 = _mm_sub_epi32(emm2, p4i_2);
-
- /* get the swap sign flag */
- emm0 = _mm_andnot_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynomial selection mask */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynomial (0 <= x <= Pi/4) */
- y = p4f_coscof_p0;
- Packet4f z = pmul(x,x);
-
- y = pmadd(y,z,p4f_coscof_p1);
- y = pmadd(y,z,p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = _mm_mul_ps(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1);
-
- /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmadd(y2, x, x);
-
- /* select the correct result from the two polynomials */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
-
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
-}
-
-#if EIGEN_FAST_MATH
-
-// This is based on Quake3's fast inverse square root.
-// For details see here: http://www.beyond3d.com/content/articles/8/
-// It loses 1 bit (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psqrt<Packet4f>(const Packet4f& _x)
-{
- Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
-
- /* select only the inverse sqrt of non-zero inputs */
- Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()));
- Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
-
- x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
- return pmul(_x,x);
-}
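For illustration, the scalar counterpart of the fast path above: a rough reciprocal square root estimate refined by one Newton-Raphson step, then multiplied by x to obtain sqrt(x). Not Eigen code; the estimate would come from _mm_rsqrt_ps (or any ~12-bit approximation), and the zero-input masking is left out.

static float sqrt_sketch(float x, float rsqrt_estimate) {
  float y = rsqrt_estimate;            // approximate 1/sqrt(x)
  y = y * (1.5f - 0.5f * x * y * y);   // one Newton step, roughly doubling the precision
  return x * y;                        // x * 1/sqrt(x) == sqrt(x)
}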
-
-#else
-
-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
-
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
-
-
-#if EIGEN_FAST_MATH
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
- _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
-
- Packet4f neg_half = pmul(_x, p4f_minus_half);
-
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and would produce infinities as well).
- Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
- Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
-
- // Fill in NaNs and Infs for the negative/zero entries.
- Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
- Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
- Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
- _mm_and_ps(zero_mask, p4f_inf));
-
- // Do a single step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
-
- // Insert NaNs and Infs in all the right places.
- return _mm_or_ps(x, infs_and_nans);
-}
-
-#else
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
- // Unfortunately we can't use the much faster _mm_rsqrt_ps since it only provides an approximation.
- return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
-}
-
-#endif
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d prsqrt<Packet2d>(const Packet2d& x) {
- // Unfortunately there is no fast reciprocal square root intrinsic for packed doubles, so fall back to a full-precision division.
- return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
-}
-
-// Identical to the ptanh in GenericPacketMath.h, but for doubles use
-// a small/medium approximation threshold of 0.001.
-template<> EIGEN_STRONG_INLINE Packet2d ptanh_approx_threshold() {
- return pset1<Packet2d>(0.001);
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h
deleted file mode 100644
index 7f4274fd99..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h
+++ /dev/null
@@ -1,883 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_SSE_H
-#define EIGEN_PACKET_MATH_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
-#endif
-
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
-#endif
-
-#ifdef __FMA__
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
-#endif
-#endif
-
-typedef __m128 Packet4f;
-typedef __m128i Packet4i;
-typedef __m128d Packet2d;
-
-template<> struct is_arithmetic<__m128> { enum { value = true }; };
-template<> struct is_arithmetic<__m128i> { enum { value = true }; };
-template<> struct is_arithmetic<__m128d> { enum { value = true }; };
-
-#define vec4f_swizzle1(v,p,q,r,s) \
- (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
-
-#define vec4i_swizzle1(v,p,q,r,s) \
- (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
-
-#define vec2d_swizzle1(v,p,q) \
- (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
-
-#define vec4f_swizzle2(a,b,p,q,r,s) \
- (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
-
-#define vec4i_swizzle2(a,b,p,q,r,s) \
- (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
-
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- const Packet4f p4f_##NAME = pset1<Packet4f>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
- const Packet2d p2d_##NAME = pset1<Packet2d>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
- const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
-
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- const Packet4i p4i_##NAME = pset1<Packet4i>(X)
-
-
-// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
-// to leverage AVX instructions.
-#ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<float> : default_packet_traits
-{
- typedef Packet4f type;
- typedef Packet4f half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=4,
- HasHalfPacket = 0,
-
- HasDiv = 1,
- HasSin = EIGEN_FAST_MATH,
- HasCos = EIGEN_FAST_MATH,
- HasTanH = 1,
- HasLog = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
-
- HasBlend = 1,
- HasSelect = 1,
- HasEq = 1,
- };
-};
-template<> struct packet_traits<double> : default_packet_traits
-{
- typedef Packet2d type;
- typedef Packet2d half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=2,
- HasHalfPacket = 0,
-
- HasDiv = 1,
- HasTanH = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
-
- HasBlend = 1,
- HasSelect = 1,
- HasEq = 1,
- };
-};
-#endif
-template<> struct packet_traits<int> : default_packet_traits
-{
- typedef Packet4i type;
- typedef Packet4i half;
- enum {
- // FIXME check the Has*
- Vectorizable = 1,
- AlignedOnScalar = 1,
- size=4,
-
- HasBlend = 1,
- };
-};
-
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
-
-#if EIGEN_COMP_MSVC==1500
-// Workaround for an MSVC 9 internal compiler error.
-// TODO: it has been detected with win64 builds (amd64), so check whether it also happens in 32-bit + SSE mode.
-// TODO: check whether a better fix exists, e.g. adding a pset0() function (the compiler crashed on pset1(0)).
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
-#endif
-
-// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
-// However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203).
-// Using inline assembly is not an option either, because gcc then fails to reorder the instructions properly.
-// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
-// Also note that with AVX, we want it to generate a vbroadcastss.
-#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
-template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
- return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
-}
-#endif
-
-#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-#endif
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
-
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f ple<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d ple<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f plt<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d plt<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f peq<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d peq<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pselect<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& false_mask) {
-#if defined(EIGEN_VECTORIZE_SSE4_1)
- return _mm_blendv_ps(a, b, false_mask);
-#else
- return _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b));
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet2d pselect<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& false_mask) {
-#if defined(EIGEN_VECTORIZE_SSE4_1)
- return _mm_blendv_pd(a, b, false_mask);
-#else
- return _mm_or_pd(_mm_andnot_pd(false_mask, a), _mm_and_pd(false_mask, b));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
- const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
- return _mm_xor_ps(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
-{
- const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
- return _mm_xor_pd(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
-{
- return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_mullo_epi32(a,b);
-#else
- // this version is slightly faster than 4 scalar products
- return vec4i_swizzle1(
- vec4i_swizzle2(
- _mm_mul_epu32(a,b),
- _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
- vec4i_swizzle1(b,1,0,3,2)),
- 0,2,0,2),
- 0,2,1,3);
-#endif
-}
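For illustration, the per-lane result the SSE2 fallback above reproduces: _mm_mul_epu32 only multiplies lanes 0 and 2 into 64-bit products, so the code multiplies twice (once with the odd lanes swizzled into the even positions) and shuffles the low 32 bits of each product back into order. A hypothetical scalar reference, not Eigen code:

static void pmul4i_reference(const int a[4], const int b[4], int out[4]) {
  for (int i = 0; i < 4; ++i)
    out[i] = static_cast<int>(static_cast<unsigned>(a[i]) * static_cast<unsigned>(b[i]));  // low 32 bits of the product
}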
-
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division is not supported by SSE");
- return pset1<Packet4i>(0);
-}
-
-// for some weird reason, this has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef __FMA__
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_min_epi32(a,b);
-#else
- // after some bench, this version *is* faster than a scalar implementation
- Packet4i mask = _mm_cmplt_epi32(a,b);
- return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_max_epi32(a,b);
-#else
- // after some bench, this version *is* faster than a scalar implementation
- Packet4i mask = _mm_cmpgt_epi32(a,b);
- return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
-
-#if EIGEN_COMP_MSVC
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD
- #if (EIGEN_COMP_MSVC==1600)
- // NOTE Some versions of MSVC10 generate bad code when using _mm_loadu_ps
- // (i.e., they do not emit an unaligned load!!).
- // TODO On most architectures this version should also be faster than a single _mm_loadu_ps,
- // so we could also enable it for MSVC08, but first we have to make sure the latter does not generate bad code when doing so...
- __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
- res = _mm_loadh_pi(res, (const __m64*)(from+2));
- return res;
- #else
- return _mm_loadu_ps(from);
- #endif
- }
- template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
- template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
-#else
-// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
-// require casting to incompatible pointer types and would lead to invalid code
-// because of the strict aliasing rule. The "dummy" stuff is required to enforce
-// a correct instruction dependency.
-// TODO: do the same for MSVC (ICC is compatible)
-// NOTE: with the code below, MSVC's compiler crashes!
-
-#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8)))
- // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
-#elif EIGEN_COMP_CLANG
- // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
-#else
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
- #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
- EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
- return _mm_loadu_ps(from);
-#else
- __m128d res;
- res = _mm_load_sd((const double*)(from)) ;
- res = _mm_loadh_pd(res, (const double*)(from+2)) ;
- return _mm_castpd_ps(res);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
- EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
- return _mm_loadu_pd(from);
-#else
- __m128d res;
- res = _mm_load_sd(from) ;
- res = _mm_loadh_pd(res,from+1);
- return res;
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
- EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
-#else
- __m128d res;
- res = _mm_load_sd((const double*)(from)) ;
- res = _mm_loadh_pd(res, (const double*)(from+2)) ;
- return _mm_castpd_si128(res);
-#endif
-}
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
- return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
-}
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{ return pset1<Packet2d>(from[0]); }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
-{
- Packet4i tmp;
- tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
- return vec4i_swizzle1(tmp, 0, 0, 1, 1);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
- EIGEN_DEBUG_UNALIGNED_STORE
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES
- _mm_storeu_pd(to, from);
-#else
- _mm_storel_pd((to), from);
- _mm_storeh_pd((to+1), from);
-#endif
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
-{
- return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
-{
- return _mm_set_pd(from[1*stride], from[0*stride]);
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
-{
- return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
-{
- to[stride*0] = _mm_cvtss_f32(from);
- to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
- to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
- to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride)
-{
- to[stride*0] = _mm_cvtsd_f64(from);
- to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
-{
- to[stride*0] = _mm_cvtsi128_si32(from);
- to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
- to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
- to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
-}
-
-// some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
-{
- Packet4f pa = _mm_set_ss(a);
- pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
-}
-// some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
-{
- Packet2d pa = _mm_set_sd(a);
- pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
-}
-
-#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-#endif
-
-#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
-// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010.
-// Direct access to the struct members fixed bug #62.
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#elif EIGEN_COMP_MSVC_STRICT
-// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#else
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{ return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{ return _mm_shuffle_epi32(a,0x1B); }
-
-template<size_t offset>
-struct protate_impl<offset, Packet4f>
-{
- static Packet4f run(const Packet4f& a) {
- return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
- }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet4i>
-{
- static Packet4i run(const Packet4i& a) {
- return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);
- }
-};
-
-template<size_t offset>
-struct protate_impl<offset, Packet2d>
-{
- static Packet2d run(const Packet2d& a) {
- return vec2d_swizzle1(a, offset, (offset + 1) % 2);
- }
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
-{
- const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
- return _mm_and_ps(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
-{
- const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
- return _mm_and_pd(a,mask);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
-{
- #ifdef EIGEN_VECTORIZE_SSSE3
- return _mm_abs_epi32(a);
- #else
- Packet4i aux = _mm_srai_epi32(a,31);
- return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
- #endif
-}
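For illustration, the scalar form of the SSE2 branchless abs used above (not Eigen code): the arithmetic shift produces 0 for non-negative inputs and -1 (all ones) for negative ones, so (a ^ s) - s is either a or -a.

static int abs_sketch(int a) {
  int s = a >> 31;     // 0 if a >= 0, -1 if a < 0 (matches _mm_srai_epi32(a, 31))
  return (a ^ s) - s;  // a when s == 0, ~a + 1 == -a when s == -1
}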
-
-// with AVX, the default implementations based on pload1 are faster
-#ifndef __AVX__
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
- a3 = pload<Packet4f>(a);
- a0 = vec4f_swizzle1(a3, 0,0,0,0);
- a1 = vec4f_swizzle1(a3, 1,1,1,1);
- a2 = vec4f_swizzle1(a3, 2,2,2,2);
- a3 = vec4f_swizzle1(a3, 3,3,3,3);
-}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
-#ifdef EIGEN_VECTORIZE_SSE3
- a0 = _mm_loaddup_pd(a+0);
- a1 = _mm_loaddup_pd(a+1);
- a2 = _mm_loaddup_pd(a+2);
- a3 = _mm_loaddup_pd(a+3);
-#else
- a1 = pload<Packet2d>(a);
- a0 = vec2d_swizzle1(a1, 0,0);
- a1 = vec2d_swizzle1(a1, 1,1);
- a3 = pload<Packet2d>(a+2);
- a2 = vec2d_swizzle1(a3, 0,0);
- a3 = vec2d_swizzle1(a3, 1,1);
-#endif
-}
-#endif
-
-EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
-{
- vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
- vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
- vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
- vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
-}
-
-#ifdef EIGEN_VECTORIZE_SSE3
-// TODO implement SSE2 versions as well as integer versions
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
- return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
-}
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
- return _mm_hadd_pd(vecs[0], vecs[1]);
-}
-// SSSE3 version:
-// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
-// {
-// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
-// }
-
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp0 = _mm_hadd_ps(a,a);
- return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
-}
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
-
-// SSSE3 version:
-// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
-// {
-// Packet4i tmp0 = _mm_hadd_epi32(a,a);
-// return pfirst(_mm_hadd_epi32(tmp0, tmp0));
-// }
-#else
-// SSE2 versions
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
- return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
- return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
- Packet4f tmp0, tmp1, tmp2;
- tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
- tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
- tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
- tmp0 = _mm_add_ps(tmp0, tmp1);
- tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
- tmp1 = _mm_add_ps(tmp1, tmp2);
- tmp2 = _mm_movehl_ps(tmp1, tmp0);
- tmp0 = _mm_movelh_ps(tmp0, tmp1);
- return _mm_add_ps(tmp0, tmp2);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
- return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
-}
-#endif // SSE3
-
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
- Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
- return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
- Packet4i tmp0, tmp1, tmp2;
- tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
- tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
- tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
- tmp0 = _mm_add_epi32(tmp0, tmp1);
- tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
- tmp1 = _mm_add_epi32(tmp1, tmp2);
- tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
- tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
- return _mm_add_epi32(tmp0, tmp2);
-}
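For illustration, the value preduxp computes (not Eigen code): output lane i is the horizontal sum of input vector i; the unpack/add sequences above amount to a 4x4 transpose fused with the additions.

static void preduxp4i_reference(const int vecs[4][4], int out[4]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = 0;
    for (int j = 0; j < 4; ++j)
      out[i] += vecs[i][j];
  }
}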
-
-// Other reduction functions:
-
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
- // after some experiments, it seems this is the fastest way to implement it
- // for GCC (e.g., reusing pmul is very slow!)
- // TODO try to call _mm_mul_epu32 directly
- EIGEN_ALIGN16 int aux[4];
- pstore(aux, a);
- return (aux[0] * aux[1]) * (aux[2] * aux[3]);
-}
-
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
-#else
- // after some experiments, it seems this is the fastest way to implement it
- // for GCC (e.g., it does not like using std::min after the pstore!!)
- EIGEN_ALIGN16 int aux[4];
- pstore(aux, a);
- int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
- int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
- return aux0<aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
-}
-
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
-}
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
-}
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
- Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
-#else
- // after some experiments, it seems this is the fastest way to implement it
- // for GCC (e.g., it does not like using std::max after the pstore!!)
- EIGEN_ALIGN16 int aux[4];
- pstore(aux, a);
- int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
- int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
- return aux0>aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
-}
-
-#if EIGEN_COMP_GNUC
-// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-// {
-// Packet4f res = b;
-// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
-// return res;
-// }
-// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
-// {
-// Packet4i res = a;
-// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
-// return res;
-// }
-#endif
-
-#ifdef EIGEN_VECTORIZE_SSSE3
-// SSSE3 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
- {
- if (Offset!=0)
- first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
- {
- if (Offset!=0)
- first = _mm_alignr_epi8(second,first, Offset*4);
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
- {
- if (Offset==1)
- first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
- }
-};
-#else
-// SSE2 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
- static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
- {
- if (Offset==1)
- {
- first = _mm_move_ss(first,second);
- first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
- }
- else if (Offset==2)
- {
- first = _mm_movehl_ps(first,first);
- first = _mm_movelh_ps(first,second);
- }
- else if (Offset==3)
- {
- first = _mm_move_ss(first,second);
- first = _mm_shuffle_ps(first,second,0x93);
- }
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
- static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
- {
- if (Offset==1)
- {
- first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
- first = _mm_shuffle_epi32(first,0x39);
- }
- else if (Offset==2)
- {
- first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
- first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
- }
- else if (Offset==3)
- {
- first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
- first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
- }
- }
-};
-
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
- static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
- {
- if (Offset==1)
- {
- first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
- first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
- }
- }
-};
-#endif
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
- _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
- __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
- kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
- kernel.packet[1] = tmp;
-}
-
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
- __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
- __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
- __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
- __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
-
- kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
- kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
- kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
- kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- __m128i false_mask = _mm_cmpeq_epi32(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
-#else
- return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
- const __m128 zero = _mm_setzero_ps();
- const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
- __m128 false_mask = _mm_cmpeq_ps(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
-#else
- return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
- const __m128d zero = _mm_setzero_pd();
- const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
- __m128d false_mask = _mm_cmpeq_pd(select, zero);
-#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
-#else
- return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
-#endif
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_SSE_H
diff --git a/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h b/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h
deleted file mode 100644
index c848932306..0000000000
--- a/third_party/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ /dev/null
@@ -1,77 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TYPE_CASTING_SSE_H
-#define EIGEN_TYPE_CASTING_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-template <>
-struct type_casting_traits<float, int> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
- return _mm_cvttps_epi32(a);
-}
-
-
-template <>
-struct type_casting_traits<int, float> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
- return _mm_cvtepi32_ps(a);
-}
-
-
-template <>
-struct type_casting_traits<double, float> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 2,
- TgtCoeffRatio = 1
- };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
- return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
-}
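For illustration: since SrcCoeffRatio is 2 and TgtCoeffRatio is 1 for the double-to-float cast, the caller passes two Packet2d and receives one Packet4f holding (a[0], a[1], b[0], b[1]). A minimal sketch of that shuffle outside Eigen, assuming only <emmintrin.h>:

#include <emmintrin.h>

static __m128 cast_two_doubles_to_floats(__m128d a, __m128d b) {
  // _mm_cvtpd_ps leaves the converted pair in the low two lanes;
  // the shuffle immediate below packs a's pair low and b's pair high.
  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}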
-
-template <>
-struct type_casting_traits<float, double> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 2
- };
-};
-
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
- // Simply discard the second half of the input
- return _mm_cvtps_pd(a);
-}
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TYPE_CASTING_SSE_H