aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2014-04-16 17:05:11 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2014-04-16 17:05:11 +0200
commitd5a795f67366db20a132cc70e4f0217f42372357 (patch)
tree74df7a911811e64a4fa0baff940abe9c97abd5b6 /Eigen/src/Core/arch
parentfeaf7c7e6d01a4804cee5949a01ece1f8a46866f (diff)
New gebp kernel handling up to 3 packets x 4 register-level blocks. Huge speedup on Haswell.
This changeset also introduces new vector functions: ploadquad and predux4.
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r--Eigen/src/Core/arch/AVX/Complex.h4
-rw-r--r--Eigen/src/Core/arch/AVX/PacketMath.h29
-rwxr-xr-xEigen/src/Core/arch/AltiVec/PacketMath.h4
-rw-r--r--Eigen/src/Core/arch/NEON/Complex.h2
-rw-r--r--Eigen/src/Core/arch/NEON/PacketMath.h4
-rw-r--r--Eigen/src/Core/arch/SSE/Complex.h4
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h6
7 files changed, 37 insertions, 16 deletions
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index cb16180c5..8f95a7be7 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -45,7 +45,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -271,7 +271,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 38f52ecc8..47e10f6da 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -83,9 +83,9 @@ template<> struct packet_traits<int> : default_packet_traits
};
*/
-template<> struct unpacket_traits<Packet8f> { typedef float type; enum {size=8}; };
-template<> struct unpacket_traits<Packet4d> { typedef double type; enum {size=4}; };
-template<> struct unpacket_traits<Packet8i> { typedef int type; enum {size=8}; };
+template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
+template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8}; };
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
@@ -141,7 +141,16 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
return _mm256_fmadd_ps(a,b,c);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+#if defined(__clang__) || defined(__GNUC__)
+ // see above
+ Packet4d res = c;
+ asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
+ return _mm256_fmadd_pd(a,b,c);
+#endif
+}
#endif
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
@@ -189,6 +198,13 @@ template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12);
}
+// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
+template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
+{
+ Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
+ return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
+}
+
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
@@ -345,6 +361,11 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
return pfirst(_mm256_hadd_pd(tmp0,tmp0));
}
+template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
+{
+ return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
+}
+
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
Packet8f tmp;
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 5d7a16f5c..16948264f 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -99,8 +99,8 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
/*
inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index e49c1a873..7ca76714f 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -47,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index fae7b55fc..83150507a 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -101,8 +101,8 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index e54ebbf90..715e5a13c 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -49,7 +49,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
#endif
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -296,7 +296,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
#endif
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index bc17726b4..89dfa6975 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -107,9 +107,9 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
#if defined(_MSC_VER) && (_MSC_VER==1500)
// Workaround MSVC 9 internal compiler error.