author      2015-02-06 05:25:03 -0800
committer   2015-02-06 05:25:03 -0800
commit      c739102ef9a52fcb194dcc77f785aa55879987e4 (patch)
tree        22d19d1df4cb20baea532fa1ce13208329ff53e3 /Eigen/src/Core/arch
parent      2559fa9b0f20ea138cfb019d441ad1757221568d (diff)
parent      a8f2c6eec788c5cccc6beb9b5837544ea98a7154 (diff)
Pulled the latest changes from the trunk
Diffstat (limited to 'Eigen/src/Core/arch')
mode        path                                      lines changed
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h         10
-rw-r--r--  Eigen/src/Core/arch/AltiVec/Complex.h       230
-rwxr-xr-x  Eigen/src/Core/arch/AltiVec/PacketMath.h    433
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h          194
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h       233
-rw-r--r--  Eigen/src/Core/arch/SSE/MathFunctions.h       6
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h         26
7 files changed, 1030 insertions, 102 deletions
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 1591458a7..e66d50649 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif @@ -137,7 +137,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co #ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { -#if defined(__clang__) || defined(__GNUC__) +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, // and gcc stupidly generates a vfmadd132ps instruction, // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate @@ -150,7 +150,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& #endif } template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { -#if defined(__clang__) || defined(__GNUC__) +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG // see above Packet4d res = c; __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 13b874d0c..f9b93a42b 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -7,23 +7,21 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#ifndef EIGEN_COMPLEX_ALTIVEC_H -#define EIGEN_COMPLEX_ALTIVEC_H +#ifndef EIGEN_COMPLEX32_ALTIVEC_H +#define EIGEN_COMPLEX32_ALTIVEC_H namespace Eigen { namespace internal { static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; -static Packet16uc p16uc_COMPLEX_RE = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; -static Packet16uc p16uc_COMPLEX_IM = vec_sld(p16uc_DUPLICATE, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; -static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_COMPLEX_RE, (Packet4ui)p16uc_COMPLEX_IM);//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; -static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_COMPLEX_RE, (Packet4ui)p16uc_COMPLEX_IM);//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; -static Packet16uc p16uc_COMPLEX_MASK16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);//{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; -static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = vec_add(p16uc_PSET_HI, p16uc_COMPLEX_MASK16);//{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; -static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = vec_add(p16uc_PSET_LO, p16uc_COMPLEX_MASK16);//{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; +#ifdef _BIG_ENDIAN +static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +#else +static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +#endif //---------- float ---------- struct Packet2cf @@ -65,7 +63,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo res.v = pload<Packet4f>((const float *)&from); else res.v = ploadu<Packet4f>((const float *)&from); - res.v = vec_perm(res.v, res.v, p16uc_PSET_HI); + res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); return res; } @@ -95,16 +93,16 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con Packet4f v1, v2; // Permute and multiply the real parts of a and b - v1 = vec_perm(a.v, a.v, p16uc_COMPLEX_RE); + v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); // Get the imaginary parts of a - v2 = vec_perm(a.v, a.v, p16uc_COMPLEX_IM); + v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); // multiply a_re * b v1 = vec_madd(v1, b.v, p4f_ZERO); // multiply a_im * b and get the conjugate result v2 = vec_madd(v2, b.v, p4f_ZERO); v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR); // permute back to a proper order - v2 = vec_perm(v2, v2, p16uc_COMPLEX_REV); + v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); return Packet2cf(vec_add(v1, v2)); } @@ -138,7 +136,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> 
pfirst<Packet2cf>(const Pack template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { Packet4f rev_a; - rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX_REV2); + rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); return Packet2cf(rev_a); } @@ -153,9 +151,13 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) { Packet4f b1, b2; - +#ifdef _BIG_ENDIAN b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8); b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8); +#else + b1 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8); + b2 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8); +#endif b2 = (Packet4f) vec_sld(b2, b2, 8); b2 = padd(b1, b2); @@ -179,7 +181,11 @@ struct palign_impl<Offset,Packet2cf> { if (Offset==1) { +#ifdef _BIG_ENDIAN first.v = vec_sld(first.v, second.v, 8); +#else + first.v = vec_sld(second.v, first.v, 8); +#endif } } }; @@ -222,23 +228,203 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con // TODO optimize it for AltiVec Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b); Packet4f s = vec_madd(b.v, b.v, p4f_ZERO); - return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX_REV)))); + return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV)))); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x) { - return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV)); + return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); } EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel) { - Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1); + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); kernel.packet[0].v = tmp; } +//---------- double ---------- +#ifdef __VSX__ +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; + +template<> struct packet_traits<std::complex<double> > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const 
std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, DenseIndex stride) +{ + std::complex<double> EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload<Packet1cd>(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, DenseIndex stride) +{ + std::complex<double> EIGEN_ALIGN16 af[2]; + pstore<std::complex<double> >(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_add(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_sub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d a_re, a_im, v1, v2; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + // multiply a_re * b + v1 = vec_madd(a_re, b.v, p2d_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(a_im, b.v, p2d_ZERO); + v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8); + v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1); + + return Packet1cd(vec_add(v1, v2)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) +{ + return pset1<Packet1cd>(*from); +} + +template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { vec_dstt((long *)addr, DST_CTRL(2,2,32), DST_CHAN); } + +template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) +{ + std::complex<double> EIGEN_ALIGN16 res[2]; + pstore<std::complex<double> >(res, a); + + return res[0]; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } + +template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) +{ + return pfirst(a); +} + +template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) +{ + return vecs[0]; +} + +template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) +{ + return pfirst(a); +} + +template<int Offset> +struct palign_impl<Offset,Packet1cd> +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is 
it sure we never have to align a Packet1cd? + // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, false,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV)))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} +#endif // __VSX__ } // end namespace internal } // end namespace Eigen -#endif // EIGEN_COMPLEX_ALTIVEC_H +#endif // EIGEN_COMPLEX32_ALTIVEC_H diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index b43e8ace3..6b68fc7a5 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,17 +18,17 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif typedef __vector float Packet4f; @@ -50,22 +50,20 @@ typedef __vector unsigned char Packet16uc; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = pset1<Packet4f>(X) -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) - #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = pset1<Packet4i>(X) +#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = pset1<Packet2d>(X) + +#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = pset1<Packet2l>(X) + #define 
DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) -// Define global static constants: -static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; -static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; -static Packet16uc p16uc_REVERSE = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; -static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); //{ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15} -static Packet16uc p16uc_DUPLICATE = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7}; +// These constants are endian-agnostic static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} @@ -74,6 +72,50 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; + +static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; +static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; + +// Mask alignment +#ifdef __PPC64__ +#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 +#else +#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 +#endif + +#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) + +// Handle endianness properly while loading constants +// Define global static constants: +#ifdef _BIG_ENDIAN +static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; +static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; +static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; +#else +static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; +static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; +static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; +#endif // _BIG_ENDIAN + +static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 
16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; + +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; + +#ifdef _BIG_ENDIAN +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +#else +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +#endif // _BIG_ENDIAN + template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; @@ -105,9 +147,22 @@ template<> struct packet_traits<int> : default_packet_traits }; }; + template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; }; template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; }; -/* + +inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) +{ + union { + Packet16uc v; + unsigned char n[16]; + } vt; + vt.v = v; + for (int i=0; i< 16; i++) + s << (int)vt.n[i] << ", "; + return s; +} + inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { union { @@ -140,7 +195,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; return s; } - +/* inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) { union { @@ -150,14 +205,21 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) vt.v = v; s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; return s; -} -*/ +}*/ + + +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } + +template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html float EIGEN_ALIGN16 af[4]; af[0] = from; - Packet4f vc = vec_ld(0, af); + Packet4f vc = pload<Packet4f>(af); vc = vec_splat(vc, 0); return vc; } @@ -165,17 +227,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { int EIGEN_ALIGN16 ai[4]; ai[0] = from; - Packet4i vc = vec_ld(0, ai); + Packet4i vc = pload<Packet4i>(ai); vc = vec_splat(vc, 0); return vc; } - - template<> EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - a3 = vec_ld(0,a); + a3 = pload<Packet4f>(a); a0 = vec_splat(a3, 0); a1 = vec_splat(a3, 1); a2 = vec_splat(a3, 2); @@ -185,7 +245,7 @@ template<> EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int *a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) { - a3 = vec_ld(0,a); + a3 = pload<Packet4i>(a); a0 = vec_splat(a3, 0); a1 = vec_splat(a3, 1); a2 = vec_splat(a3, 2); @@ -199,7 +259,7 @@ 
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa af[1] = from[1*stride]; af[2] = from[2*stride]; af[3] = from[3*stride]; - return vec_ld(0, af); + return pload<Packet4f>(af); } template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, DenseIndex stride) { @@ -208,12 +268,12 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f ai[1] = from[1*stride]; ai[2] = from[2*stride]; ai[3] = from[3*stride]; - return vec_ld(0, ai); + return pload<Packet4i>(ai); } template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, DenseIndex stride) { float EIGEN_ALIGN16 af[4]; - vec_st(from, 0, af); + pstore<float>(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; to[2*stride] = af[2]; @@ -222,7 +282,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, DenseIndex stride) { int EIGEN_ALIGN16 ai[4]; - vec_st(from, 0, ai); + pstore<int>((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; to[2*stride] = ai[2]; @@ -283,7 +343,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const */ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { - Packet4f t, y_0, y_1, res; +#ifndef __VSX__ // VSX actually provides a div instruction + Packet4f t, y_0, y_1; // Altivec does not offer a divide instruction, we have to do a reciprocal approximation y_0 = vec_re(b); @@ -292,8 +353,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const t = vec_nmsub(y_0, b, p4f_ONE); y_1 = vec_madd(y_0, t, y_0); - res = vec_madd(a, y_1, p4f_ZERO); - return res; + return vec_madd(a, y_1, p4f_ZERO); +#else + return vec_div(a, b); +#endif } template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) @@ -311,7 +374,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } -// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } @@ -324,13 +386,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } - +#ifdef _BIG_ENDIAN template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html Packet16uc MSQ, LSQ; Packet16uc mask; MSQ = vec_ld(0, 
(unsigned char *)from); // most significant quadword @@ -350,25 +409,36 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) mask = vec_lvsl(0, from); // create the permute mask return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data } +#else +// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); +} +template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); +} +#endif template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) { Packet4f p; if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from); else p = ploadu<Packet4f>(from); - return vec_perm(p, p, p16uc_DUPLICATE); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); } template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) { Packet4i p; if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from); else p = ploadu<Packet4i>(from); - return vec_perm(p, p, p16uc_DUPLICATE); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); } -template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } -template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } - +#ifdef _BIG_ENDIAN template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE @@ -405,15 +475,30 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } +#else +// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); +} +template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); +} +#endif +#ifndef __VSX__ template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } +#endif template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); } template<> EIGEN_STRONG_INLINE Packet4f pabs(const 
Packet4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } @@ -460,7 +545,11 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) { Packet4i sum; sum = vec_sums(a, p4i_ZERO); +#ifdef _BIG_ENDIAN sum = vec_sld(sum, p4i_ZERO, 12); +#else + sum = vec_sld(p4i_ZERO, sum, 4); +#endif return pfirst(sum); } @@ -547,8 +636,25 @@ struct palign_impl<Offset,Packet4f> { static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) { - if (Offset!=0) - first = vec_sld(first, second, Offset*4); +#ifdef _BIG_ENDIAN + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } +#else + switch (Offset % 4) { + case 1: + first = vec_sld(second, first, 12); break; + case 2: + first = vec_sld(second, first, 8); break; + case 3: + first = vec_sld(second, first, 4); break; + } +#endif } }; @@ -557,8 +663,25 @@ struct palign_impl<Offset,Packet4i> { static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) { - if (Offset!=0) - first = vec_sld(first, second, Offset*4); +#ifdef _BIG_ENDIAN + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } +#else + switch (Offset % 4) { + case 1: + first = vec_sld(second, first, 12); break; + case 2: + first = vec_sld(second, first, 8); break; + case 3: + first = vec_sld(second, first, 4); break; + } +#endif } }; @@ -588,6 +711,222 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } + +//---------- double ---------- +#ifdef __VSX__ +typedef __vector double Packet2d; +typedef __vector unsigned long long Packet2ul; +typedef __vector long long Packet2l; + +static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO; +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO; +static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; + +#ifdef _BIG_ENDIAN +static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8); +#else +static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8); +#endif + +static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index) +{ + switch (index) { + case 0: + return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI); + case 1: + return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO); + } + return a; +} + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasExp = 0, + HasSqrt = 0 + }; +}; + +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; }; + + +inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) +{ + union { + Packet2d v; + double n[2]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1]; + return s; +} + +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME + +template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); } + +template<> 
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { + double EIGEN_ALIGN16 af[2]; + af[0] = from; + Packet2d vc = pload<Packet2d>(af); + vc = vec_splat_dbl(vc, 0); + return vc; +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet2d>(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload<Packet2d>(a); + a0 = vec_splat_dbl(a1, 0); + a1 = vec_splat_dbl(a1, 1); + a3 = pload<Packet2d>(a+2); + a2 = vec_splat_dbl(a3, 0); + a3 = vec_splat_dbl(a3, 1); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride) +{ + double EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload<Packet2d>(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride) +{ + double EIGEN_ALIGN16 af[2]; + pstore<double>(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} +template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); } + +template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); } + +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); } +template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } + +// for some weird raisons, it has to be overloaded for packet of integers +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } + +template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } + +template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + return (Packet2d) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); +} +template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) +{ + Packet2d p; + if((ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from); + else p = ploadu<Packet2d>(from); + return vec_perm(p, p, p16uc_PSET64_HI); +} + +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); +} + +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); } + +template<> EIGEN_STRONG_INLINE 
double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); } + +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } + +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) +{ + Packet2d b, sum; + b = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8); + sum = vec_add(a, b); + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) +{ + Packet2d v[2], sum; + v[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8)); + v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8)); + +#ifdef _BIG_ENDIAN + sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8); +#else + sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8); +#endif + + return sum; +} +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) +{ + return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8))); +} + +// min +template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) +{ + return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8))); +} + +// max +template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) +{ + return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8))); +} + +template<int Offset> +struct palign_impl<Offset,Packet2d> +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) +#ifdef _BIG_ENDIAN + first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8); +#else + first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8); +#endif + } +}; + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2d,2>& kernel) { + Packet2d t0, t1; + t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); + t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); + kernel.packet[0] = t0; + kernel.packet[1] = t1; +} + +#endif // __VSX__ } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 42e7733d7..0fdcb0741 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -33,6 +33,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, size = 2, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -88,7 +89,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { - return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); + return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); } template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { @@ -252,7 +253,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true> template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec + // TODO optimize it for NEON Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b); Packet4f s, 
rev_s; @@ -265,11 +266,198 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf,2>& kernel) { - float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); + Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); kernel.packet[1].v = tmp; } +//---------- double ---------- +#if EIGEN_ARCH_ARM64 + +static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000); + +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; + +template<> struct packet_traits<std::complex<double> > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } + +template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d v1, v2; + + // Get the real values of a + v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); + // Get the real values of a + v2 = vdupq_lane_f64(vget_high_f64(a.v), 1); + // Multiply the real a with b + v1 = vmulq_f64(v1, b.v); + // Multiply the imag a with b + v2 = vmulq_f64(v2, b.v); + // Conjugate v2 + v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); + // Swap real/imag elements in v2. 
+ v2 = preverse<Packet2d>(v2); + // Add and return the result + return Packet1cd(vaddq_f64(v1, v2)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); } + +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, DenseIndex stride) +{ + Packet2d res; + res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1); + return Packet1cd(res); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, DenseIndex stride) +{ + to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); +} + + +template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) +{ + std::complex<double> EIGEN_ALIGN16 res; + pstore<std::complex<double> >(&res, a); + + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } + +template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); } + +template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; } + +template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); } + +template<int Offset> +struct palign_impl<Offset,Packet1cd> +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
+ } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, false,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for NEON + Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); + Packet2d s = pmul<Packet2d>(b.v, b.v); + Packet2d rev_s = preverse<Packet2d>(s); + + return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel) +{ + Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} +#endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 0504c095c..9afd86bec 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -20,14 +20,24 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif + // FIXME NEON has 16 quad registers, but since the current register allocator // is so bad, it is much better to reduce it to 8 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif +typedef float32x2_t Packet2f; typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; +typedef int32x2_t Packet2i; typedef uint32x4_t Packet4ui; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ @@ -39,7 +49,7 @@ typedef uint32x4_t Packet4ui; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1<Packet4i>(X) -#if defined(__llvm__) && !defined(__clang__) +#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG //Special treatment for Apple's llvm-gcc, its NEON packet types are unions #define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}} #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}} @@ -52,11 +62,11 @@ typedef uint32x4_t Packet4ui; // arm64 does have the pld instruction. 
If available, let's trust the __builtin_prefetch built-in function // which available on LLVM and GCC (at least) -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__) +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif !defined(__aarch64__) +#elif !EIGEN_ARCH_ARM64 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #else // by default no explicit prefetching @@ -66,11 +76,12 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; - typedef Packet4f half; + typedef Packet2f half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 4, + HasHalfPacket=1, HasDiv = 1, // FIXME check the Has* @@ -84,16 +95,17 @@ template<> struct packet_traits<float> : default_packet_traits template<> struct packet_traits<int> : default_packet_traits { typedef Packet4i type; - typedef Packet4i half; + typedef Packet2i half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + HasHalfPacket=1 // FIXME check the Has* }; }; -#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__) +#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM // workaround gcc 4.2, 4.3 and 4.4 compilatin issue EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } @@ -136,6 +148,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { +#if EIGEN_ARCH_ARM64 + return vdivq_f32(a,b); +#else Packet4f inv, restep, div; // NEON does not offer a divide instruction, we have to do a reciprocal approximation @@ -154,14 +169,27 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const div = vmulq_f32(a, inv); return div; +#endif } + template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { eigen_assert(false && "packet integer division are not supported by NEON"); return pset1<Packet4i>(0); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. +// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. +// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. +// MLA is not fused i.e. does 2 roundings. +// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): +// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +#endif + +// No FMA instruction for int, so use MLA unconditionally. 
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } @@ -472,6 +500,193 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) { kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); } +//---------- double ---------- +#if EIGEN_ARCH_ARM64 + +#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) +// Bug 907: workaround missing declarations of the following two functions in the ADK +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_f64 (float64x2_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_f64_u64 (uint64x2_t __a) +{ + return (float64x2_t) __a; +} +#endif + +typedef float64x2_t Packet2d; +typedef float64x1_t Packet1d; + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet2d type; + typedef Packet1d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket=1, + + HasDiv = 1, + // FIXME check the Has* + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; +}; + +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; }; + +template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) +{ + Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1); + return vaddq_f64(pset1<Packet2d>(a), countdown); +} +template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } + +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +#else +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } + +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} + +template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) +{ + return vld1q_dup_f64(from); +} +template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } + +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); } + +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride) +{ + Packet2d res; + res = vsetq_lane_f64(from[0*stride], res, 0); + res = vsetq_lane_f64(from[1*stride], res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride) +{ + to[stride*0] = vgetq_lane_f64(from, 0); + to[stride*1] = vgetq_lane_f64(from, 1); +} +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); } + +// FIXME only store the 2 first elements ? 
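// ---- Editorial aside, not part of the patch ----------------------------------
// What the strided pgather/pscatter pair above is for: with the stride equal to
// the row length, a gather walks down a column of a row-major matrix and a
// scatter writes it back the same way. A sketch against the internal API; it
// assumes a target where Packet2d vectorization is actually enabled (e.g. the
// AArch64 path added in this patch) and an Eigen build on the include path:

#include <Eigen/Core>
#include <cstdio>

int main()
{
  using namespace Eigen::internal;
  double m[6] = { 0, 1, 2,
                  3, 4, 5 };                           // 2x3, row-major
  Packet2d col = pgather<double, Packet2d>(m + 1, 3);  // column 1: m[1], m[4]
  col = padd(col, pset1<Packet2d>(10.0));              // update the column as one packet
  pscatter<double, Packet2d>(m + 1, col, 3);           // write it back with the same stride
  std::printf("%g %g\n", m[1], m[4]);                  // prints: 11 14
  return 0;
}
// ---- End of aside --------------------------------------------------------------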
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); } + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } + +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } + +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif + +template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) +{ + float64x2_t trn1, trn2; + + // NEON zip performs interleaving of the supplied vectors. + // We perform two interleaves in a row to acquire the transposed vector + trn1 = vzip1q_f64(vecs[0], vecs[1]); + trn2 = vzip2q_f64(vecs[0], vecs[1]); + + // Do the addition of the resulting vectors + return vaddq_f64(trn1, trn2); +} +// Other reduction functions: +// mul +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif + +// min +template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } + +// max +template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); } + +// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, +// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 +#define PALIGN_NEON(Offset,Type,Command) \ +template<>\ +struct palign_impl<Offset,Type>\ +{\ + EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ + {\ + if (Offset!=0)\ + first = Command(first, second, Offset);\ + }\ +};\ + +PALIGN_NEON(0,Packet2d,vextq_f64) +PALIGN_NEON(1,Packet2d,vextq_f64) +#undef PALIGN_NEON + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2d,2>& kernel) { + float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); + float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); + + kernel.packet[0] = trn1; + kernel.packet[1] = trn2; +} +#endif // EIGEN_ARCH_ARM64 + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 8f78b3a6c..9ffba5b41 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -52,7 +52,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x) Packet4i emm0; - Packet4f invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps()); + Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ @@ -167,7 +167,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x) emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, p4i_0x7f); emm0 = _mm_slli_epi32(emm0, 23); - return pmul(y, Packet4f(_mm_castsi128_ps(emm0))); + return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); } template<> 
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp<Packet2d>(const Packet2d& _x) @@ -241,7 +241,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x) emm0 = _mm_add_epi32(emm0, p4i_1023_0); emm0 = _mm_slli_epi32(emm0, 20); emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmul(x, Packet2d(_mm_castsi128_pd(emm0))); + return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); } /* evaluation of 4 sines at onces, using SSE2 intrinsics. diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 6923c88ec..3befd4c25 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,13 +22,13 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif -#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER) +#if defined EIGEN_VECTORIZE_AVX && EIGEN_COMP_GNUC_STRICT // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot // have overloads for both types without linking error. // One solution is to increase ABI version using -fabi-version=4 (or greater). @@ -147,7 +147,7 @@ template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4} template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; }; template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; }; -#if defined(_MSC_VER) && (_MSC_VER==1500) +#if EIGEN_COMP_MSVC==1500 // Workaround MSVC 9 internal compiler error. // TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode // TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)). @@ -165,7 +165,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { re // Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. // Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. // Also note that with AVX, we want it to generate a vbroadcastss. -#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) +#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__) template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) { return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); } @@ -282,10 +282,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { E template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); } -#if defined(_MSC_VER) +#if EIGEN_COMP_MSVC template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD - #if (_MSC_VER==1600) + #if (EIGEN_COMP_MSVC==1600) // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps // (i.e., it does not generate an unaligned load!! 
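// ---- Editorial aside, not part of the patch ----------------------------------
// The MathFunctions.h changes above are about NaN handling: _mm_cmpnge_ps flags
// NaN inputs for plog (NaN >= 0 is false, so "not greater-or-equal" is true),
// and pexp now returns pmax(result, _x) so that a NaN input propagates to the
// output. That works because SSE MAXPS is asymmetric for unordered operands:
// when either input is NaN it returns its second source operand, so
// max(anything, NaN) is NaN. A small demonstration:

#include <xmmintrin.h>
#include <cmath>
#include <cstdio>

int main()
{
  __m128 qnan = _mm_set1_ps(std::nanf(""));
  __m128 one  = _mm_set1_ps(1.0f);

  float r1[4], r2[4];
  _mm_storeu_ps(r1, _mm_max_ps(one, qnan)); // second operand is NaN -> NaN
  _mm_storeu_ps(r2, _mm_max_ps(qnan, one)); // second operand is 1   -> 1
  std::printf("max(1, NaN) = %g, max(NaN, 1) = %g\n", r1[0], r2[0]);
  return 0;
}
// ---- End of aside --------------------------------------------------------------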
// TODO On most architectures this version should also be faster than a single _mm_loadu_ps @@ -307,11 +307,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E // TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! -#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8))) +#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8))) // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1 -#elif defined(__clang__) +#elif EIGEN_COMP_CLANG // bug 201: Segfaults in __mm_loadh_pd with clang 2.8 #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 @@ -439,13 +439,13 @@ template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_p template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } #endif -#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) +#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 // Direct of the struct members fixed bug #62. template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; } template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; } template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } -#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#elif EIGEN_COMP_MSVC_STRICT // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; } template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; } @@ -680,7 +680,7 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) #endif // EIGEN_VECTORIZE_SSE4_1 } -#if (defined __GNUC__) +#if EIGEN_COMP_GNUC // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) // { // Packet4f res = b; |
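Taken together, the primitives this patch touches (pset1, ploadu, pmadd, predux, pfirst, ...) are the building blocks Eigen's higher-level kernels are written against. As a closing illustration, and not code from the patch, here is a minimal packet-at-a-time dot product written directly on that internal API; it assumes n is a multiple of the packet size and leaves tail handling, alignment and unrolling to the real kernels:

#include <Eigen/Core>

double dot(const double* x, const double* y, int n)
{
  using namespace Eigen::internal;
  typedef packet_traits<double>::type Packet;          // Packet2d on AArch64 NEON or SSE2
  const int PacketSize = packet_traits<double>::size;  // 2 for the targets in this patch

  Packet acc = pset1<Packet>(0.0);
  for (int i = 0; i < n; i += PacketSize)
    acc = pmadd(ploadu<Packet>(x + i), ploadu<Packet>(y + i), acc); // fused where the ISA allows it
  return predux(acc);                                               // horizontal sum of the packet
}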