diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2014-03-26 19:03:07 -0700 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2014-03-26 19:03:07 -0700 |
commit | a419cea4a0ff545f3221020119d5eb6ab4cd3e48 (patch) | |
tree | 2ea3a4e92a767bd92003b23235afcc936df32553 /Eigen | |
parent | 14bc4b9704b7e347ffcfe3c52588790e27e5118b (diff) |
Created the ptranspose packet primitive that can transpose an array of N packets, where N is the number of words in each packet. This primitive will be used to complete the vectorization of the gemm_pack_lhs and gemm_pack_rhs functions.
Implemented the primitive using SSE instructions.
Diffstat (limited to 'Eigen')
-rwxr-xr-x | Eigen/src/Core/GenericPacketMath.h | 15 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 10 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 25 |
3 files changed, 49 insertions, 1 deletions
```diff
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index d07541285..f9ddf4718 100755
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -386,9 +386,22 @@ template<> inline std::complex<double> pmul(const std::complex<double>& a, const
 
 #endif
 
+
+/***************************************************************************
+ * Kernel, that is a collection of N packets where N is the number of words
+ * in the packet.
+***************************************************************************/
+template <typename Packet> struct Kernel {
+  Packet packet[unpacket_traits<Packet>::size];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
 
 #endif // EIGEN_GENERIC_PACKET_MATH_H
-
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 91bba5e38..2dce66819 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -435,6 +435,16 @@ EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
   return Packet1cd(preverse(x.v));
 }
 
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2cf>& kernel) {
+  __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+  __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+  __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+  kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+  kernel.packet[1].v = tmp;
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 9d8faa7d6..937f63f88 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -707,6 +707,31 @@ struct palign_impl<Offset,Packet2d>
 };
 #endif
 
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4f>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2d>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4i>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
```