From 5fdc1792410f7c2a0aa751ed7cabc013026aef26 Mon Sep 17 00:00:00 2001 From: Pedro Caldeira Date: Mon, 11 May 2020 16:38:56 -0300 Subject: Altivec template functions to better code reusability --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 728 +++++++++++-------------------- 1 file changed, 257 insertions(+), 471 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 83b75b974..d12aa2b10 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -75,6 +75,7 @@ typedef __vector uint8_t Packet16uc; #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) +#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits::type // These constants are endian-agnostic static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} @@ -375,8 +376,8 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) return s; } -// Need to define them first or we get specialization after instantiation errors -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +template +EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) { // some versions of GCC throw "unused-but-set-parameter". // ignoring these warnings for now. @@ -389,57 +390,39 @@ template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) #endif } +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + return pload_common(from); +} + template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_xl(0, from); -#else - return vec_ld(0, from); -#endif + return pload_common(from); } template<> EIGEN_STRONG_INLINE Packet8s pload(const short int* from) { - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD - return vec_ld(0, from); + return pload_common(from); } template<> EIGEN_STRONG_INLINE Packet8us pload(const unsigned short int* from) { - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD - return vec_ld(0, from); + return pload_common(from); } template<> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) { - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD - return vec_ld(0, from); + return pload_common(from); } template<> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) { - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD - return vec_ld(0, from); + return pload_common(from); } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ +template +EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ // some versions of GCC throw "unused-but-set-parameter" (float *to). // ignoring these warnings for now. EIGEN_UNUSED_VARIABLE(to); @@ -448,247 +431,232 @@ template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& f vec_xst(from, 0, to); #else vec_st(from, 0, to); -#endif +#endif +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + pstore_common(to, from); } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_xst(from, 0, to); -#else - vec_st(from, 0, to); -#endif + pstore_common(to, from); } template<> EIGEN_STRONG_INLINE void pstore(short int* to, const Packet8s& from) { - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE - vec_st(from, 0, to); + pstore_common(to, from); } template<> EIGEN_STRONG_INLINE void pstore(unsigned short int* to, const Packet8us& from) { - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE - vec_st(from, 0, to); + pstore_common(to, from); } + template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) { - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE - vec_st(from, 0, to); + pstore_common(to, from); } template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) { - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE - vec_st(from, 0, to); + pstore_common(to, from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - Packet4f v = {from, from, from, from}; +template +EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from}; return v; } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { - Packet4i v = {from, from, from, from}; +template +EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from, from, from, from, from}; return v; } -template<> EIGEN_STRONG_INLINE Packet8s pset1(const short int& from) { - Packet8s v = {from, from, from, from, from, from, from, from}; +template +EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; return v; } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { + return pset1_size4(from); +} + +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { + return pset1_size4(from); +} + +template<> EIGEN_STRONG_INLINE Packet8s pset1(const short int& from) { + return pset1_size8(from); +} + template<> EIGEN_STRONG_INLINE Packet8us pset1(const unsigned short int& from) { - Packet8us v = {from, from, from, from, from, from, from, from}; - return v; + return pset1_size8(from); } template<> EIGEN_STRONG_INLINE Packet16c pset1(const int8_t& from) { - Packet16c v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; - return v; + return pset1_size16(from); } template<> EIGEN_STRONG_INLINE Packet16uc pset1(const uint8_t& from) { - Packet16uc v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; - return v; + return pset1_size16(from); } template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return reinterpret_cast(pset1(from)); } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +template EIGEN_STRONG_INLINE void +pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - a3 = pload(a); + a3 = pload(a); a0 = vec_splat(a3, 0); a1 = vec_splat(a3, 1); a2 = vec_splat(a3, 2); a3 = vec_splat(a3, 3); } + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + pbroadcast4_common(a, a0, a1, a2, a3); +} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) { - a3 = pload(a); - a0 = vec_splat(a3, 0); - a1 = vec_splat(a3, 1); - a2 = vec_splat(a3, 2); - a3 = vec_splat(a3, 3); + pbroadcast4_common(a, a0, a1, a2, a3); +} + +template EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + return pload(a); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - EIGEN_ALIGN16 float af[4]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - af[2] = from[2*stride]; - af[3] = from[3*stride]; - return pload(af); + return pgather_common(from, stride); } + template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - EIGEN_ALIGN16 int ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); + return pgather_common(from, stride); } + +template EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + a[4] = from[4*stride]; + a[5] = from[5*stride]; + a[6] = from[6*stride]; + a[7] = from[7*stride]; + return pload(a); +} + template<> EIGEN_DEVICE_FUNC inline Packet8s pgather(const short int* from, Index stride) { - EIGEN_ALIGN16 short int ai[8]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - ai[4] = from[4*stride]; - ai[5] = from[5*stride]; - ai[6] = from[6*stride]; - ai[7] = from[7*stride]; - return pload(ai); + return pgather_size8(from, stride); } template<> EIGEN_DEVICE_FUNC inline Packet8us pgather(const unsigned short int* from, Index stride) { - EIGEN_ALIGN16 unsigned short int ai[8]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - ai[4] = from[4*stride]; - ai[5] = from[5*stride]; - ai[6] = from[6*stride]; - ai[7] = from[7*stride]; - return pload(ai); + return pgather_size8(from, stride); +} + +template EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + a[4] = from[4*stride]; + a[5] = from[5*stride]; + a[6] = from[6*stride]; + a[7] = from[7*stride]; + a[8] = from[8*stride]; + a[9] = from[9*stride]; + a[10] = from[10*stride]; + a[11] = from[11*stride]; + a[12] = from[12*stride]; + a[13] = from[13*stride]; + a[14] = from[14*stride]; + a[15] = from[15*stride]; + return pload(a); } + template<> EIGEN_DEVICE_FUNC inline Packet16c pgather(const int8_t* from, Index stride) { - EIGEN_ALIGN16 int8_t ai[16]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - ai[4] = from[4*stride]; - ai[5] = from[5*stride]; - ai[6] = from[6*stride]; - ai[7] = from[7*stride]; - ai[8] = from[8*stride]; - ai[9] = from[9*stride]; - ai[10] = from[10*stride]; - ai[11] = from[11*stride]; - ai[12] = from[12*stride]; - ai[13] = from[13*stride]; - ai[14] = from[14*stride]; - ai[15] = from[15*stride]; - return pload(ai); + return pgather_size16(from, stride); } template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather(const uint8_t* from, Index stride) { - EIGEN_ALIGN16 uint8_t ai[16]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - ai[4] = from[4*stride]; - ai[5] = from[5*stride]; - ai[6] = from[6*stride]; - ai[7] = from[7*stride]; - ai[8] = from[8*stride]; - ai[9] = from[9*stride]; - ai[10] = from[10*stride]; - ai[11] = from[11*stride]; - ai[12] = from[12*stride]; - ai[13] = from[13*stride]; - ai[14] = from[14*stride]; - ai[15] = from[15*stride]; - return pload(ai); + return pgather_size16(from, stride); +} + +template EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; } template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - EIGEN_ALIGN16 float af[4]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; - to[2*stride] = af[2]; - to[3*stride] = af[3]; + pscatter_size4(to, from, stride); } + template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - EIGEN_ALIGN16 int ai[4]; - pstore((int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; + pscatter_size4(to, from, stride); +} + +template EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; + to[4*stride] = a[4]; + to[5*stride] = a[5]; + to[6*stride] = a[6]; + to[7*stride] = a[7]; } + template<> EIGEN_DEVICE_FUNC inline void pscatter(short int* to, const Packet8s& from, Index stride) { - EIGEN_ALIGN16 short int ai[8]; - pstore((short int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; - to[4*stride] = ai[4]; - to[5*stride] = ai[5]; - to[6*stride] = ai[6]; - to[7*stride] = ai[7]; + pscatter_size8(to, from, stride); } template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned short int* to, const Packet8us& from, Index stride) { - EIGEN_ALIGN16 unsigned short int ai[8]; - pstore((unsigned short int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; - to[4*stride] = ai[4]; - to[5*stride] = ai[5]; - to[6*stride] = ai[6]; - to[7*stride] = ai[7]; + pscatter_size8(to, from, stride); } template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } @@ -819,122 +787,62 @@ template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data - -} -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data -} -template<> EIGEN_STRONG_INLINE Packet8s ploadu(const short int* from) +template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from) { EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return static_cast(vec_perm(MSQ, LSQ, mask)); // align the data -} -template<> EIGEN_STRONG_INLINE Packet8us ploadu(const unsigned short int* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html +#ifdef _BIG_ENDIAN Packet16uc MSQ, LSQ; Packet16uc mask; MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword mask = vec_lvsl(0, from); // create the permute mask - return static_cast(vec_perm(MSQ, LSQ, mask)); // align the data -} -template<> EIGEN_STRONG_INLINE Packet16c ploadu(const char* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, from); // most significant quadword - LSQ = vec_ld(15, from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return static_cast(vec_perm(MSQ, LSQ, mask)); // align the data -} - -template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const unsigned char* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, from); // most significant quadword - LSQ = vec_ld(15, from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return static_cast(vec_perm(MSQ, LSQ, mask)); // align the data -} + //TODO: Add static_cast here + return (Packet) vec_perm(MSQ, LSQ, mask); // align the data #else -// We also need to redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vec_xl(0, from); +#endif } + template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + return ploadu_common(from); +} +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +{ + return ploadu_common(from); } template<> EIGEN_STRONG_INLINE Packet8s ploadu(const short int* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_vsx_ld(0, from); + return ploadu_common(from); } template<> EIGEN_STRONG_INLINE Packet8us ploadu(const unsigned short int* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_vsx_ld(0, from); + return ploadu_common(from); } template<> EIGEN_STRONG_INLINE Packet16c ploadu(const int8_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_vsx_ld(0, from); + return ploadu_common(from); } template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const uint8_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_vsx_ld(0, from); + return ploadu_common(from); } -#endif -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +template EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) { - Packet4f p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); + Packet p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); return vec_perm(p, p, p16uc_DUPLICATE32_HI); } - +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + return ploaddup_common(from); +} template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { - Packet4i p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); + return ploaddup_common(from); } template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const short int* from) @@ -985,10 +893,10 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const uint8_t* return vec_perm(p, p, p16uc_DUPLICATE8_HI); } -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +template EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from) { EIGEN_DEBUG_UNALIGNED_STORE +#ifdef _BIG_ENDIAN // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Warning: not thread safe! Packet16uc MSQ, LSQ, edges; @@ -1002,140 +910,34 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& f MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part +#else + vec_xst(from, 0, to); +#endif +} +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ + pstoreu_common(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part + pstoreu_common(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(short int* to, const Packet8s& from) { - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part + pstoreu_common(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(unsigned short int* to, const Packet8us& from) { - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part -} - -template<> EIGEN_STRONG_INLINE void pstoreu(char* to, const Packet16c& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, to); // most significant quadword - LSQ = vec_ld(15,to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, to ); // Store the LSQ part first - vec_st( MSQ, 0, to ); // Store the MSQ part -} -template<> EIGEN_STRONG_INLINE void pstoreu(unsigned char* to, const Packet16uc& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, to); // most significant quadword - LSQ = vec_ld(15,to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, to ); // Store the LSQ part first - vec_st( MSQ, 0, to ); // Store the MSQ part -} -#else -// We also need to redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - vec_xst(from, 0, to); -} -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - vec_xst(from, 0, to); -} -template<> EIGEN_STRONG_INLINE void pstoreu(short int* to, const Packet8s& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - /*GCC provides a commonly used synonym for vec_xst called vec_vsx_st. - * Although these have the same behavior, - * only vec_xst is guaranteed to be portable across compliant compilers - * vec_xst should be preferred. */ - vec_xst(from, 0, to); -} -template<> EIGEN_STRONG_INLINE void pstoreu(unsigned short int* to, const Packet8us& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - /*GCC provides a commonly used synonym for vec_xst called vec_vsx_st. - * Although these have the same behavior, - * only vec_xst is guaranteed to be portable across compliant compilers - * vec_xst should be preferred. */ - vec_xst(from, 0, to); + pstoreu_common(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet16c& from) +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet16c& from) { - EIGEN_DEBUG_UNALIGNED_STORE - vec_vsx_st(from, 0, to); + pstoreu_common(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet16uc& from) +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet16uc& from) { - EIGEN_DEBUG_UNALIGNED_STORE - vec_vsx_st(from, 0, to); + pstoreu_common(to, from); } -#endif template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } @@ -1143,29 +945,28 @@ template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGE template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE short int pfirst(const Packet8s& a) { - EIGEN_ALIGN16 short int x; +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) { + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE short int pfirst(const Packet8s& a) { + return pfirst_common(a); +} + template<> EIGEN_STRONG_INLINE unsigned short int pfirst(const Packet8us& a) { - EIGEN_ALIGN16 unsigned short int x; - vec_ste(a, 0, &x); - return x; + return pfirst_common(a); } template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet16c& a) { - EIGEN_ALIGN16 int8_t x; - vec_ste(a, 0, &x); - return x; + return pfirst_common(a); } + template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet16uc& a) { - EIGEN_ALIGN16 uint8_t x; - vec_ste(a, 0, &x); - return x; + return pfirst_common(a); } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) @@ -1237,44 +1038,37 @@ template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) return pfirst(sum); } -template<> EIGEN_STRONG_INLINE short int predux(const Packet8s& a) +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) { union{ - Packet8s v; - short int n[8]; + Packet v; + __UNPACK_TYPE__(Packet) n[8]; } vt; vt.v = a; - EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; + EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; Packet4i first_half = pload(first_loader); Packet4i second_half = pload(second_loader); - return static_cast(predux(first_half) + predux(second_half)); + return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half)); } -template<> EIGEN_STRONG_INLINE unsigned short int predux(const Packet8us& a) +template<> EIGEN_STRONG_INLINE short int predux(const Packet8s& a) { - union{ - Packet8us v; - unsigned short int n[8]; - } vt; - vt.v = a; - - //There is no predux for Packet4ui. So we are intentionally using int - EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; - EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; - Packet4i first_half = pload(first_loader); - Packet4i second_half = pload(second_loader); + return predux_size8(a); +} - return static_cast(predux(first_half) + predux(second_half)); +template<> EIGEN_STRONG_INLINE unsigned short int predux(const Packet8us& a) +{ + return predux_size8(a); } -template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) { union{ - Packet16c v; - int8_t n[16]; + Packet v; + __UNPACK_TYPE__(Packet) n[16]; } vt; vt.v = a; @@ -1288,33 +1082,19 @@ template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) Packet4i third_quarter = pload(third_loader); Packet4i fourth_quarter = pload(fourth_loader); - return static_cast(predux(first_quarter) + predux(second_quarter) + return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) + predux(fourth_quarter)); } -template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) { - union{ - Packet16uc v; - uint8_t n[16]; - } vt; - vt.v = a; - - EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; - EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; - EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] }; - EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] }; - - Packet4i first_quarter = pload(first_loader); - Packet4i second_quarter = pload(second_loader); - Packet4i third_quarter = pload(third_loader); - Packet4i fourth_quarter = pload(fourth_loader); - - - return static_cast(predux(first_quarter) + predux(second_quarter) - + predux(third_quarter) + predux(fourth_quarter)); + return predux_size16(a); } +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) +{ + return predux_size16(a); +} // Other reduction functions: // mul @@ -1379,20 +1159,24 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& } // min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +template EIGEN_STRONG_INLINE +__UNPACK_TYPE__(Packet) predux_min4(const Packet& a) { - Packet4f b, res; + Packet b, res; b = vec_min(a, vec_sld(a, a, 8)); res = vec_min(b, vec_sld(b, b, 4)); return pfirst(res); } + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + return predux_min4(a); +} + template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { - Packet4i b, res; - b = vec_min(a, vec_sld(a, a, 8)); - res = vec_min(b, vec_sld(b, b, 4)); - return pfirst(res); + return predux_min4(a); } template<> EIGEN_STRONG_INLINE short int predux_min(const Packet8s& a) @@ -1449,20 +1233,22 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet16uc& return pfirst(result); } // max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) { - Packet4f b, res; + Packet b, res; b = vec_max(a, vec_sld(a, a, 8)); res = vec_max(b, vec_sld(b, b, 4)); return pfirst(res); } +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + return predux_max4(a); +} + template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { - Packet4i b, res; - b = vec_max(a, vec_sld(a, a, 8)); - res = vec_max(b, vec_sld(b, b, 4)); - return pfirst(res); + return predux_max4(a); } template<> EIGEN_STRONG_INLINE short int predux_max(const Packet8s& a) @@ -1524,9 +1310,9 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return vec_any_ne(x, pzero(x)); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4f t0, t1, t2, t3; +template EIGEN_DEVICE_FUNC inline void +ptranpose_common(PacketBlock& kernel){ + T t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); @@ -1537,17 +1323,14 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranpose_common(kernel); +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - Packet4i t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); + ptranpose_common(kernel); } EIGEN_DEVICE_FUNC inline void @@ -1811,16 +1594,19 @@ ptranspose(PacketBlock& kernel) { kernel.packet[15] = vec_mergel(step3[7], step3[15]); } -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { +template EIGEN_STRONG_INLINE +Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); return vec_sel(elsePacket, thenPacket, mask); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + return pblend4(ifPacket, thenPacket, elsePacket); +} + template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); - return vec_sel(elsePacket, thenPacket, mask); + return pblend4(ifPacket, thenPacket, elsePacket); } template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) { -- cgit v1.2.3