diff options
author | Gael Guennebaud <g.gael@free.fr> | 2014-04-16 17:05:11 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2014-04-16 17:05:11 +0200 |
commit | d5a795f67366db20a132cc70e4f0217f42372357 (patch) | |
tree | 74df7a911811e64a4fa0baff940abe9c97abd5b6 /Eigen/src/Core/arch/AVX | |
parent | feaf7c7e6d01a4804cee5949a01ece1f8a46866f (diff) |
New gebp kernel handling up to 3 packets x 4 register-level blocks. Huge speeup on Haswell.
This changeset also introduce new vector functions: ploadquad and predux4.
Diffstat (limited to 'Eigen/src/Core/arch/AVX')
-rw-r--r-- | Eigen/src/Core/arch/AVX/Complex.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 29 |
2 files changed, 27 insertions, 6 deletions
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index cb16180c5..8f95a7be7 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -45,7 +45,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; }; +template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -271,7 +271,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; }; +template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 38f52ecc8..47e10f6da 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -83,9 +83,9 @@ template<> struct packet_traits<int> : default_packet_traits }; */ -template<> struct unpacket_traits<Packet8f> { typedef float type; enum {size=8}; }; -template<> struct unpacket_traits<Packet4d> { typedef double type; enum {size=4}; }; -template<> struct unpacket_traits<Packet8i> { typedef int type; enum {size=8}; }; +template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8}; }; +template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; }; +template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); } @@ -141,7 +141,16 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& return _mm256_fmadd_ps(a,b,c); #endif } -template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { +#if defined(__clang__) || defined(__GNUC__) + // see above + Packet4d res = c; + asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_pd(a,b,c); +#endif +} #endif template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } @@ -189,6 +198,13 @@ template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12); } +// Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} +template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) +{ + Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from)); + return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1); +} + template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } @@ -345,6 +361,11 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) return pfirst(_mm256_hadd_pd(tmp0,tmp0)); } +template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a) +{ + return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); +} + template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) { Packet8f tmp; |