From 225ab040e078b923ece75b7a49ae0cef980c226f Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 7 May 2020 17:14:26 -0700 Subject: Remove unused packet op "palign". Clean up a compiler warning in c++03 mode in AVX512/Complex.h. --- Eigen/src/Core/GenericPacketMath.h | 29 ----- Eigen/src/Core/arch/AVX/Complex.h | 20 ---- Eigen/src/Core/arch/AVX/PacketMath.h | 97 ---------------- Eigen/src/Core/arch/AVX512/Complex.h | 22 +--- Eigen/src/Core/arch/AVX512/PacketMath.h | 46 -------- Eigen/src/Core/arch/AltiVec/Complex.h | 26 ----- Eigen/src/Core/arch/AltiVec/PacketMath.h | 184 ------------------------------- Eigen/src/Core/arch/MSA/Complex.h | 18 --- Eigen/src/Core/arch/MSA/PacketMath.h | 32 ------ Eigen/src/Core/arch/NEON/Complex.h | 20 ---- Eigen/src/Core/arch/NEON/PacketMath.h | 157 -------------------------- Eigen/src/Core/arch/SSE/Complex.h | 23 ---- Eigen/src/Core/arch/SSE/PacketMath.h | 108 ------------------ Eigen/src/Core/arch/ZVector/Complex.h | 34 ------ Eigen/src/Core/arch/ZVector/PacketMath.h | 67 ----------- test/packetmath.cpp | 33 ------ 16 files changed, 1 insertion(+), 915 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 3f2489b46..0ed5d2cc5 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -685,35 +685,6 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_t return ploadt(from); } -/** \internal default implementation of palign() allowing partial specialization */ -template -struct palign_impl -{ - // by default data are aligned, so there is nothing to be done :) - static inline void run(PacketType&, const PacketType&) {} -}; - -/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements - * of \a first and \a Offset first elements of \a second. - * - * This function is currently only used to optimize matrix-vector products on unligned matrices. - * It takes 2 packets that represent a contiguous memory array, and returns a packet starting - * at the position \a Offset. For instance, for packets of 4 elements, we have: - * Input: - * - first = {f0,f1,f2,f3} - * - second = {s0,s1,s2,s3} - * Output: - * - if Offset==0 then {f0,f1,f2,f3} - * - if Offset==1 then {f1,f2,f3,s0} - * - if Offset==2 then {f2,f3,s0,s1} - * - if Offset==3 then {f3,s0,s1,s3} - */ -template -inline void palign(PacketType& first, const PacketType& second) -{ - palign_impl::run(first,second); -} - /*************************************************************************** * Fast complex products (GCC generates a function call which is very slow) ***************************************************************************/ diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 03a097e49..893eb2702 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -157,16 +157,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const @@ -339,16 +329,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 3ed713eee..10196fd6d 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -691,93 +691,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) return _mm256_movemask_ps(x)!=0; } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) - { - if (Offset==1) - { - first = _mm256_blend_ps(first, second, 1); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0x88); - } - else if (Offset==2) - { - first = _mm256_blend_ps(first, second, 3); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xcc); - } - else if (Offset==3) - { - first = _mm256_blend_ps(first, second, 7); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xee); - } - else if (Offset==4) - { - first = _mm256_blend_ps(first, second, 15); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); - } - else if (Offset==5) - { - first = _mm256_blend_ps(first, second, 31); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0x88); - } - else if (Offset==6) - { - first = _mm256_blend_ps(first, second, 63); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xcc); - } - else if (Offset==7) - { - first = _mm256_blend_ps(first, second, 127); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xee); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) - { - if (Offset==1) - { - first = _mm256_blend_pd(first, second, 1); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 0xA); - } - else if (Offset==2) - { - first = _mm256_blend_pd(first, second, 3); - first = _mm256_permute2f128_pd(first, first, 1); - } - else if (Offset==3) - { - first = _mm256_blend_pd(first, second, 7); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 5); - } - } -}; - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); @@ -1078,16 +991,6 @@ template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::ha return _mm_insert_epi16(a,int(b.x),7); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second) - { - if (Offset!=0) - first = _mm_alignr_epi8(second,first, Offset*2); - } -}; - EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __m128i a = kernel.packet[0]; diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 219de36db..75bdf57f1 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -153,16 +153,6 @@ EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) return Packet4cf(res); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet8cf& first, const Packet8cf& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const @@ -239,7 +229,7 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0, + HasSetLinear = 0 }; }; @@ -351,16 +341,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4cd& first, const Packet4cd& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 3a48ea028..346d1f06e 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -919,52 +919,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) return !_mm512_kortestz(tmp,tmp); } -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet16f& first, - const Packet16f& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, - Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, - Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); - - __m512i second_idx = - _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, - Offset - 5, Offset - 6, Offset - 7, Offset - 8, - Offset - 9, Offset - 10, Offset - 11, Offset - 12, - Offset - 13, Offset - 14, Offset - 15, Offset - 16); - - unsigned short mask = 0xFFFF; - mask <<= (16 - Offset); - - first = _mm512_permutexvar_ps(first_idx, first); - Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); - first = _mm512_mask_blend_ps(mask, first, tmp); - } - } -}; -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, - Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); - - __m512i second_idx = _mm512_set_epi32( - 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, - Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); - - unsigned char mask = 0xFF; - mask <<= (8 - Offset); - - first = _mm512_permutexvar_pd(first_idx, first); - Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); - first = _mm512_mask_blend_pd(mask, first, tmp); - } - } -}; #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 2a2689bc6..69d2ceca8 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -159,22 +159,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { -#ifdef _BIG_ENDIAN - first.v = vec_sld(first.v, second.v, 8); -#else - first.v = vec_sld(second.v, first.v, 8); -#endif - } - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -346,16 +330,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index f2dd98c06..83b75b974 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -1524,176 +1524,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return vec_any_ne(x, pzero(x)); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } -#else - switch (Offset % 4) { - case 1: - first = vec_sld(second, first, 12); break; - case 2: - first = vec_sld(second, first, 8); break; - case 3: - first = vec_sld(second, first, 4); break; - } -#endif - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } -#else - switch (Offset % 4) { - case 1: - first = vec_sld(second, first, 12); break; - case 2: - first = vec_sld(second, first, 8); break; - case 3: - first = vec_sld(second, first, 4); break; - } -#endif - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet8s& first, const Packet8s& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 8) { - case 1: - first = vec_sld(first, second, 2); break; - case 2: - first = vec_sld(first, second, 4); break; - case 3: - first = vec_sld(first, second, 6); break; - case 4: - first = vec_sld(first, second, 8); break; - case 5: - first = vec_sld(first, second, 10); break; - case 6: - first = vec_sld(first, second, 12); break; - case 7: - first = vec_sld(first, second, 14); break; - } -#else - switch (Offset % 8) { - case 1: - first = vec_sld(second, first, 14); break; - case 2: - first = vec_sld(second, first, 12); break; - case 3: - first = vec_sld(second, first, 10); break; - case 4: - first = vec_sld(second, first, 8); break; - case 5: - first = vec_sld(second, first, 6); break; - case 6: - first = vec_sld(second, first, 4); break; - case 7: - first = vec_sld(second, first, 2); break; - } -#endif - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet8us& first, const Packet8us& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 8) { - case 1: - first = vec_sld(first, second, 2); break; - case 2: - first = vec_sld(first, second, 4); break; - case 3: - first = vec_sld(first, second, 6); break; - case 4: - first = vec_sld(first, second, 8); break; - case 5: - first = vec_sld(first, second, 10); break; - case 6: - first = vec_sld(first, second, 12); break; - case 7: - first = vec_sld(first, second, 14); break; - } -#else - switch (Offset % 8) { - case 1: - first = vec_sld(second, first, 14); break; - case 2: - first = vec_sld(second, first, 12); break; - case 3: - first = vec_sld(second, first, 10); break; - case 4: - first = vec_sld(second, first, 8); break; - case 5: - first = vec_sld(second, first, 6); break; - case 6: - first = vec_sld(second, first, 4); break; - case 7: - first = vec_sld(second, first, 2); break; - } -#endif - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet16c& first, const Packet16c& second) - { - const int shift = Offset % 16; - if ( shift == 0 ) return; -#ifdef _BIG_ENDIAN - first = vec_sld(first, second, shift); -#else - first = vec_sld(first, second, shift); -#endif - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet16uc& first, const Packet16uc& second) - { - const int shift = Offset % 16; - if ( shift == 0 ) return; -#ifdef _BIG_ENDIAN - first = vec_sld(first, second, shift); -#else - first = vec_sld(first, second, shift); -#endif - } -}; - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4f t0, t1, t2, t3; @@ -2362,20 +2192,6 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset == 1) -#ifdef _BIG_ENDIAN - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); -#else - first = reinterpret_cast(vec_sld(reinterpret_cast(second), reinterpret_cast(first), 8)); -#endif - } -}; - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet2d t0, t1; diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index 7baa25e33..4877a95a8 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -305,15 +305,6 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); } -template -struct palign_impl { - EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) { - if (Offset == 1) { - first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8); - } - } -}; - template <> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, @@ -653,15 +644,6 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& return pfirst(a); } -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes - // boundary... - } -}; - template <> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index ff4e1d5f1..f03cf61ff 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -675,25 +675,6 @@ EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { return m[0]; } -#define PALIGN_MSA(Offset, Type, Command) \ - template <> \ - struct palign_impl { \ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \ - if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 4)); \ - } \ - }; - -PALIGN_MSA(0, Packet4f, __builtin_msa_sldi_b) -PALIGN_MSA(1, Packet4f, __builtin_msa_sldi_b) -PALIGN_MSA(2, Packet4f, __builtin_msa_sldi_b) -PALIGN_MSA(3, Packet4f, __builtin_msa_sldi_b) -PALIGN_MSA(0, Packet4i, __builtin_msa_sldi_b) -PALIGN_MSA(1, Packet4i, __builtin_msa_sldi_b) -PALIGN_MSA(2, Packet4i, __builtin_msa_sldi_b) -PALIGN_MSA(3, Packet4i, __builtin_msa_sldi_b) - -#undef PALIGN_MSA - inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << "," << std::endl @@ -1168,19 +1149,6 @@ EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { #endif } -#define PALIGN_MSA(Offset, Type, Command) \ - template <> \ - struct palign_impl { \ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \ - if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 8)); \ - } \ - }; - -PALIGN_MSA(0, Packet2d, __builtin_msa_sldi_b) -PALIGN_MSA(1, Packet2d, __builtin_msa_sldi_b) - -#undef PALIGN_MSA - inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]"; return os; diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index b03c66122..8cd2a5ebe 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -340,16 +340,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return s; } -template -struct palign_impl -{ - EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset == 1) - first.v = vextq_f32(first.v, second.v, 2); - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const @@ -602,16 +592,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 3d24f00ce..5937433f5 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -2708,147 +2708,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return vget_lane_u32(vpmax_u32(tmp, tmp), 0); } -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ - -template -EIGEN_STRONG_INLINE T palign_4c(const T& first, const T &second, const int n) -{ - return static_cast((static_cast(second) << (32 - n * 8)) | (static_cast(first) >> (n * 8))); -} - -PALIGN_NEON(0, Packet2f, vext_f32) -PALIGN_NEON(1, Packet2f, vext_f32) - -PALIGN_NEON(0, Packet4f, vextq_f32) -PALIGN_NEON(1, Packet4f, vextq_f32) -PALIGN_NEON(2, Packet4f, vextq_f32) -PALIGN_NEON(3, Packet4f, vextq_f32) - -PALIGN_NEON(0, Packet4c, palign_4c) -PALIGN_NEON(1, Packet4c, palign_4c) -PALIGN_NEON(2, Packet4c, palign_4c) -PALIGN_NEON(3, Packet4c, palign_4c) - -PALIGN_NEON(0, Packet8c, vext_s8) -PALIGN_NEON(1, Packet8c, vext_s8) -PALIGN_NEON(2, Packet8c, vext_s8) -PALIGN_NEON(3, Packet8c, vext_s8) -PALIGN_NEON(4, Packet8c, vext_s8) -PALIGN_NEON(5, Packet8c, vext_s8) -PALIGN_NEON(6, Packet8c, vext_s8) -PALIGN_NEON(7, Packet8c, vext_s8) - -PALIGN_NEON(0, Packet16c, vextq_s8) -PALIGN_NEON(1, Packet16c, vextq_s8) -PALIGN_NEON(2, Packet16c, vextq_s8) -PALIGN_NEON(3, Packet16c, vextq_s8) -PALIGN_NEON(4, Packet16c, vextq_s8) -PALIGN_NEON(5, Packet16c, vextq_s8) -PALIGN_NEON(6, Packet16c, vextq_s8) -PALIGN_NEON(7, Packet16c, vextq_s8) -PALIGN_NEON(8, Packet16c, vextq_s8) -PALIGN_NEON(9, Packet16c, vextq_s8) -PALIGN_NEON(10, Packet16c, vextq_s8) -PALIGN_NEON(11, Packet16c, vextq_s8) -PALIGN_NEON(12, Packet16c, vextq_s8) -PALIGN_NEON(13, Packet16c, vextq_s8) -PALIGN_NEON(14, Packet16c, vextq_s8) -PALIGN_NEON(15, Packet16c, vextq_s8) - -PALIGN_NEON(0, Packet4uc, palign_4c) -PALIGN_NEON(1, Packet4uc, palign_4c) -PALIGN_NEON(2, Packet4uc, palign_4c) -PALIGN_NEON(3, Packet4uc, palign_4c) - -PALIGN_NEON(0, Packet8uc, vext_u8) -PALIGN_NEON(1, Packet8uc, vext_u8) -PALIGN_NEON(2, Packet8uc, vext_u8) -PALIGN_NEON(3, Packet8uc, vext_u8) -PALIGN_NEON(4, Packet8uc, vext_u8) -PALIGN_NEON(5, Packet8uc, vext_u8) -PALIGN_NEON(6, Packet8uc, vext_u8) -PALIGN_NEON(7, Packet8uc, vext_u8) - -PALIGN_NEON(0, Packet16uc, vextq_u8) -PALIGN_NEON(1, Packet16uc, vextq_u8) -PALIGN_NEON(2, Packet16uc, vextq_u8) -PALIGN_NEON(3, Packet16uc, vextq_u8) -PALIGN_NEON(4, Packet16uc, vextq_u8) -PALIGN_NEON(5, Packet16uc, vextq_u8) -PALIGN_NEON(6, Packet16uc, vextq_u8) -PALIGN_NEON(7, Packet16uc, vextq_u8) -PALIGN_NEON(8, Packet16uc, vextq_u8) -PALIGN_NEON(9, Packet16uc, vextq_u8) -PALIGN_NEON(10, Packet16uc, vextq_u8) -PALIGN_NEON(11, Packet16uc, vextq_u8) -PALIGN_NEON(12, Packet16uc, vextq_u8) -PALIGN_NEON(13, Packet16uc, vextq_u8) -PALIGN_NEON(14, Packet16uc, vextq_u8) -PALIGN_NEON(15, Packet16uc, vextq_u8) - -PALIGN_NEON(0, Packet4s, vext_s16) -PALIGN_NEON(1, Packet4s, vext_s16) -PALIGN_NEON(2, Packet4s, vext_s16) -PALIGN_NEON(3, Packet4s, vext_s16) - -PALIGN_NEON(0, Packet8s, vextq_s16) -PALIGN_NEON(1, Packet8s, vextq_s16) -PALIGN_NEON(2, Packet8s, vextq_s16) -PALIGN_NEON(3, Packet8s, vextq_s16) -PALIGN_NEON(4, Packet8s, vextq_s16) -PALIGN_NEON(5, Packet8s, vextq_s16) -PALIGN_NEON(6, Packet8s, vextq_s16) -PALIGN_NEON(7, Packet8s, vextq_s16) - -PALIGN_NEON(0, Packet4us, vext_u16) -PALIGN_NEON(1, Packet4us, vext_u16) -PALIGN_NEON(2, Packet4us, vext_u16) -PALIGN_NEON(3, Packet4us, vext_u16) - -PALIGN_NEON(0, Packet8us, vextq_u16) -PALIGN_NEON(1, Packet8us, vextq_u16) -PALIGN_NEON(2, Packet8us, vextq_u16) -PALIGN_NEON(3, Packet8us, vextq_u16) -PALIGN_NEON(4, Packet8us, vextq_u16) -PALIGN_NEON(5, Packet8us, vextq_u16) -PALIGN_NEON(6, Packet8us, vextq_u16) -PALIGN_NEON(7, Packet8us, vextq_u16) - -PALIGN_NEON(0, Packet2i, vext_s32) -PALIGN_NEON(1, Packet2i, vext_s32) - -PALIGN_NEON(0, Packet4i, vextq_s32) -PALIGN_NEON(1, Packet4i, vextq_s32) -PALIGN_NEON(2, Packet4i, vextq_s32) -PALIGN_NEON(3, Packet4i, vextq_s32) - -PALIGN_NEON(0, Packet2ui, vext_u32) -PALIGN_NEON(1, Packet2ui, vext_u32) - -PALIGN_NEON(0, Packet4ui, vextq_u32) -PALIGN_NEON(1, Packet4ui, vextq_u32) -PALIGN_NEON(2, Packet4ui, vextq_u32) -PALIGN_NEON(3, Packet4ui, vextq_u32) - -PALIGN_NEON(0, Packet2l, vextq_s64) -PALIGN_NEON(1, Packet2l, vextq_s64) - -PALIGN_NEON(0, Packet2ul, vextq_u64) -PALIGN_NEON(1, Packet2ul, vextq_u64) - -#undef PALIGN_NEON - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { const float32x2x2_t z = vzip_f32(kernel.packet[0], kernel.packet[1]); @@ -3563,22 +3422,6 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a,a), 0); } -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ - -PALIGN_NEON(0, Packet2d, vextq_f64) -PALIGN_NEON(1, Packet2d, vextq_f64) -#undef PALIGN_NEON EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index d6bfeafe4..a16d73e27 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -161,19 +161,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = _mm_movehl_ps(first.v, first.v); - first.v = _mm_movelh_ps(first.v, second.v); - } - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -346,16 +333,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f4a409430..cf2f0be17 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -867,114 +867,6 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return _mm_movemask_ps(x) != 0x0; } -#if EIGEN_COMP_GNUC -// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -// { -// Packet4f res = b; -// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); -// return res; -// } -// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i) -// { -// Packet4i res = a; -// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i)); -// return res; -// } -#endif - -#ifdef EIGEN_VECTORIZE_SSSE3 -// SSSE3 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset!=0) - first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset!=0) - first = _mm_alignr_epi8(second,first, Offset*4); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); - } -}; -#else -// SSE2 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset==1) - { - first = _mm_move_ss(first,second); - first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); - } - else if (Offset==2) - { - first = _mm_movehl_ps(first,first); - first = _mm_movelh_ps(first,second); - } - else if (Offset==3) - { - first = _mm_move_ss(first,second); - first = _mm_shuffle_ps(first,second,0x93); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset==1) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_shuffle_epi32(first,0x39); - } - else if (Offset==2) - { - first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); - first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - } - else if (Offset==3) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - { - first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); - first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); - } - } -}; -#endif - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index f589fddd8..d3e41b43e 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -160,16 +160,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -331,18 +321,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return res; } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset == 1) { - first.cd[0] = first.cd[1]; - first.cd[1] = second.cd[0]; - } - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -457,18 +435,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = vec_sld(first.v, second.v, 8); - } - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 3435f7c1e..3fb642a38 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -298,33 +298,6 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) } #endif - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset == 1) - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); - } -}; - template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { // FIXME: No intrinsic yet @@ -636,30 +609,6 @@ template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Pack return splat; } -/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double - */ -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - switch (Offset % 4) { - case 1: - first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); - first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); - break; - case 2: - first.v4f[0] = first.v4f[1]; - first.v4f[1] = second.v4f[0]; - break; - case 3: - first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); - first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); - break; - } - } -}; - template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { // FIXME: No intrinsic yet @@ -942,22 +891,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons return result; } #else -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } - } -}; - template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { // FIXME: No intrinsic yet diff --git a/test/packetmath.cpp b/test/packetmath.cpp index cceaff7c5..7341d67e7 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -103,7 +103,6 @@ template void packetmath() EIGEN_ALIGN_MAX Scalar data1[size]; EIGEN_ALIGN_MAX Scalar data2[size]; EIGEN_ALIGN_MAX Scalar data3[size]; - EIGEN_ALIGN_MAX Packet packets[PacketSize*2]; EIGEN_ALIGN_MAX Scalar ref[size]; RealScalar refvalue = RealScalar(0); for (int i=0; i void packetmath() } } - for (int offset=0; offset(data1); - packets[1] = internal::pload(data1+PacketSize); - if (offset==0) internal::palign<0>(packets[0], packets[1]); - else if (offset==1) internal::palign(packets[0], packets[1]); - else if (offset==2) internal::palign(packets[0], packets[1]); - else if (offset==3) internal::palign(packets[0], packets[1]); - else if (offset==4) internal::palign(packets[0], packets[1]); - else if (offset==5) internal::palign(packets[0], packets[1]); - else if (offset==6) internal::palign(packets[0], packets[1]); - else if (offset==7) internal::palign(packets[0], packets[1]); - else if (offset==8) internal::palign(packets[0], packets[1]); - else if (offset==9) internal::palign(packets[0], packets[1]); - else if (offset==10) internal::palign(packets[0], packets[1]); - else if (offset==11) internal::palign(packets[0], packets[1]); - else if (offset==12) internal::palign(packets[0], packets[1]); - else if (offset==13) internal::palign(packets[0], packets[1]); - else if (offset==14) internal::palign(packets[0], packets[1]); - else if (offset==15) internal::palign(packets[0], packets[1]); - internal::pstore(data2, packets[0]); - - for (int i=0; i