aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/AVX/Complex.h
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2014-03-27 09:34:51 -0700
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2014-03-27 09:34:51 -0700
commitc4902a3d0182dfc9ac02a24ec2a52cd567ac0104 (patch)
treeab2c8d944d37c1d32e1da6eb3bd4929d4588e024 /Eigen/src/Core/arch/AVX/Complex.h
parent7d73c7f18be20407ffd28f72b3275da233658f80 (diff)
Implemented the AVX version of the ptranspose packet primitive.
Diffstat (limited to 'Eigen/src/Core/arch/AVX/Complex.h')
-rw-r--r--Eigen/src/Core/arch/AVX/Complex.h25
1 files changed, 25 insertions, 0 deletions
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 4f4f6eb52..0121cec86 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -404,6 +404,31 @@ template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x
return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet4cf>& kernel) {
+ __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
+ __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
+ __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
+ __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
+
+ __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
+ __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
+ __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
+ __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
+
+ kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
+ kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
+ kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
+ kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void
+ptranspose(Kernel<Packet2cd>& kernel) {
+ __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
+ kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
+ kernel.packet[0].v = tmp;
+}
+
} // end namespace internal
} // end namespace Eigen