aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/SSE/PacketMath.h
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2008-08-03 15:15:46 +0000
committerGravatar Gael Guennebaud <g.gael@free.fr>2008-08-03 15:15:46 +0000
commit6d11a07e5e3d3659f74e39e2639c3ef9a4926366 (patch)
treea6160eea6bcca299f3eab7d05af1a6171079acde /Eigen/src/Core/arch/SSE/PacketMath.h
parent55aeb1f83a5c303da09f5c5ef3037e75e71312cd (diff)
Added a ei_palign function align a packet from two others.
This allows much faster code dealing with unligned as in the updated matrix-vector product functions.
Diffstat (limited to 'Eigen/src/Core/arch/SSE/PacketMath.h')
-rw-r--r--Eigen/src/Core/arch/SSE/PacketMath.h81
1 files changed, 80 insertions, 1 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 28825ceee..07aa84d69 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -90,6 +90,12 @@ template<> inline __m128d ei_pload(const double* from) { return _mm_load_pd(fro
template<> inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
template<> inline __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
+// template<> inline __m128 ei_ploadu(const float* from) {
+// if (size_t(from)&0xF)
+// return _mm_loadu_ps(from);
+// else
+// return _mm_loadu_ps(from);
+// }
template<> inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); }
template<> inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
@@ -198,7 +204,80 @@ inline __m128i ei_preduxp(const __m128i* vecs)
// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
// return res;
// }
+// inline __m128i _mm_alignr_epi8(const __m128i& a, const __m128i& b, const int i)
+// {
+// __m128i res = a;
+// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
+// return res;
+// }
#endif
-#endif // EIGEN_PACKET_MATH_SSE_H
+#ifdef __SSSE3__
+// SSSE3 versions
+template<int Offset>
+struct ei_palign_impl<Offset,__m128>
+{
+ inline static void run(__m128& first, const __m128& second)
+ {
+ first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(first), _mm_castps_si128(second), (4-Offset)*4));
+ }
+};
+template<int Offset>
+struct ei_palign_impl<Offset,__m128i>
+{
+ inline static void run(__m128i& first, const __m128i& second)
+ {
+ first = _mm_alignr_epi8(first, second, (4-Offset)*4);
+ }
+};
+#else
+// SSE2 versions
+template<int Offset>
+struct ei_palign_impl<Offset,__m128>
+{
+ inline static void run(__m128& first, const __m128& second)
+ {
+ if (Offset==1)
+ {
+ first = _mm_move_ss(first,second);
+ first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
+ }
+ else if (Offset==2)
+ {
+ first = _mm_movehl_ps(first,first);
+ first = _mm_movelh_ps(first,second);
+ }
+ else if (Offset==3)
+ {
+ first = _mm_move_ss(first,second);
+ first = _mm_shuffle_ps(first,second,0x93);
+ }
+ }
+};
+
+template<int Offset>
+struct ei_palign_impl<Offset,__m128i>
+{
+ inline static void run(__m128i& first, const __m128i& second)
+ {
+ if (Offset==1)
+ {
+ first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+ first = _mm_shuffle_epi32(first,0x39);
+ }
+ else if (Offset==2)
+ {
+ first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
+ first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+ }
+ else if (Offset==3)
+ {
+ first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+ first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
+ }
+ }
+};
+#endif
+
+#endif // EIGEN_PACKET_MATH_SSE_H