aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/AVX
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2014-04-25 10:56:18 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2014-04-25 10:56:18 +0200
commit3d8d0f6269478a06f4fcbd4b838c8e9b9d7e9d62 (patch)
tree9f55e81498c1acc61ea0c10bce0ead69a71875d1 /Eigen/src/Core/arch/AVX
parentb0e19db1cf462a07e25429d4f04f7d8e858f670f (diff)
Enable vectorization of pack_rhs with a column-major RHS.
Rename and generalize Kernel<*> to PacketBlock<*,N>.
Diffstat (limited to 'Eigen/src/Core/arch/AVX')
-rw-r--r--Eigen/src/Core/arch/AVX/Complex.h8
-rw-r--r--Eigen/src/Core/arch/AVX/PacketMath.h26
2 files changed, 26 insertions, 8 deletions
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 18be2f225..9ced85132 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -431,8 +431,8 @@ template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x
return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(Kernel<Packet4cf>& kernel) {
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cf,4>& kernel) {
__m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
__m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
__m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
@@ -449,8 +449,8 @@ ptranspose(Kernel<Packet4cf>& kernel) {
kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
}
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(Kernel<Packet2cd>& kernel) {
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cd,2>& kernel) {
__m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
kernel.packet[0].v = tmp;
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index e6f540430..8b8307d75 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -498,8 +498,8 @@ struct palign_impl<Offset,Packet4d>
}
};
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(Kernel<Packet8f>& kernel) {
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,8>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
@@ -526,8 +526,26 @@ ptranspose(Kernel<Packet8f>& kernel) {
kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
-template<> EIGEN_DEVICE_FUNC inline void
-ptranspose(Kernel<Packet4d>& kernel) {
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,4>& kernel) {
+ __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+ __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+ __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+ __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+ __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
+ __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+ __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+ __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+
+ kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
+ kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
+ kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
+ kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4d,4>& kernel) {
__m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
__m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
__m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);