aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2014-03-26 19:03:07 -0700
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2014-03-26 19:03:07 -0700
commita419cea4a0ff545f3221020119d5eb6ab4cd3e48 (patch)
tree2ea3a4e92a767bd92003b23235afcc936df32553 /Eigen/src/Core/arch
parent14bc4b9704b7e347ffcfe3c52588790e27e5118b (diff)
Created the ptranspose packet primitive that can transpose an array of N packets, where N is the number of words in each packet. This primitive will be used to complete the vectorization of the gemm_pack_lhs and gemm_pack_rhs functions.
Implemented the primitive using SSE instructions.
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r--Eigen/src/Core/arch/SSE/Complex.h10
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h25
2 files changed, 35 insertions, 0 deletions
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 91bba5e38..2dce66819 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -435,6 +435,16 @@ EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
return Packet1cd(preverse(x.v));
}
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2cf>& kernel) {
+ __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+ __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+ __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+ kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+ kernel.packet[1].v = tmp;
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 9d8faa7d6..937f63f88 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -707,6 +707,31 @@ struct palign_impl<Offset,Packet2d>
};
#endif
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4f>& kernel) {
+ _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2d>& kernel) {
+ __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+ kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+ kernel.packet[1] = tmp;
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4i>& kernel) {
+ __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+ __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+ __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+ __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+ kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+ kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+ kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+ kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+
} // end namespace internal
} // end namespace Eigen