diff options
author | Gael Guennebaud <g.gael@free.fr> | 2019-01-30 15:57:08 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2019-01-30 15:57:08 +0100 |
commit | eb4c6bb22dfb6c83deaea65fe5d1bbb4cdc1f8bb (patch) | |
tree | b7689e88a529be171fac3a29d3d8cbb8b7564b64 /Eigen/src/Core | |
parent | e3622a03964d389fe099d1689fff059ba3cd9830 (diff) | |
parent | 1024a70e82c0301d9f699fd344613e9cd417ab95 (diff) |
Fix conflicts and merge
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- | Eigen/src/Core/arch/AVX/Complex.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX512/Complex.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX512/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/Complex.h | 4 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/AltiVec/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/GPU/PacketMath.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/GPU/PacketMathHalf.h | 8 | ||||
-rw-r--r-- | Eigen/src/Core/arch/MSA/Complex.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/MSA/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/Complex.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 4 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SYCL/InteropHeaders.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/ZVector/Complex.h | 4 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/ZVector/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 700 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 23 | ||||
-rw-r--r-- | Eigen/src/Core/util/XprHelper.h | 3 |
20 files changed, 522 insertions, 290 deletions
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 16faf1082..5b8ff59bd 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -47,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; #endif -template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -263,7 +263,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; #endif -template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index c18c18cc3..ee00f1f7d 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -117,14 +117,14 @@ template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; typedef Packet8i integer_packet; - enum {size=8, alignment=Aligned32}; + enum {size=8, alignment=Aligned32, vectorizable=true}; }; template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; - enum {size=4, alignment=Aligned32}; + enum {size=4, alignment=Aligned32, vectorizable=true}; }; -template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; +template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); } diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 7ea72c509..9a89dd01f 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -50,7 +50,8 @@ template<> struct unpacket_traits<Packet8cf> { typedef std::complex<float> type; enum { size = 8, - alignment=unpacket_traits<Packet16f>::alignment + alignment=unpacket_traits<Packet16f>::alignment, + vectorizable=true }; typedef Packet4cf half; }; @@ -245,7 +246,8 @@ template<> struct unpacket_traits<Packet4cd> { typedef std::complex<double> type; enum { size = 4, - alignment = unpacket_traits<Packet8d>::alignment + alignment = unpacket_traits<Packet8d>::alignment, + vectorizable=true }; typedef Packet2cd half; }; diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index a2c9599b6..2199970ad 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -102,19 +102,19 @@ struct unpacket_traits<Packet16f> { typedef float type; typedef Packet8f half; typedef Packet16i integer_packet; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment=Aligned64, vectorizable=true }; }; template <> struct unpacket_traits<Packet8d> { typedef double type; typedef Packet4d half; - enum { size = 8, alignment=Aligned64 }; + enum { size = 8, alignment=Aligned64, vectorizable=true }; }; template <> struct unpacket_traits<Packet16i> { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment=Aligned64, vectorizable=false }; }; template <> diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 23dd016f9..440d058d8 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -60,7 +60,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) { @@ -286,7 +286,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 6b347cedb..9535724eb 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -192,13 +192,13 @@ template<> struct unpacket_traits<Packet4f> typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits<Packet4i> { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=false}; }; inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) @@ -921,7 +921,7 @@ template<> struct packet_traits<double> : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index e3b2d56ec..c1b097fb9 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -90,8 +90,8 @@ template<> struct packet_traits<double> : default_packet_traits }; -template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; -template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; +template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; }; +template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef double2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) { return make_float4(from, from, from, from); diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 020baa353..316ac0283 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -41,7 +41,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits }; }; -template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; +template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { @@ -521,7 +521,7 @@ struct packet_traits<half> : default_packet_traits { }; -template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; +template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; }; template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) { Packet16h result; @@ -1003,7 +1003,7 @@ struct packet_traits<Eigen::half> : default_packet_traits { }; -template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; +template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; }; template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) { Packet8h result; @@ -1359,7 +1359,7 @@ struct packet_traits<Eigen::half> : default_packet_traits { }; -template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; +template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; }; template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) { Packet4h result; diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index 9a45cf51e..fa64d3564 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -127,7 +127,7 @@ struct packet_traits<std::complex<float> > : default_packet_traits { template <> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; - enum { size = 2, alignment = Aligned16 }; + enum { size = 2, alignment = Aligned16, vectorizable=true }; typedef Packet2cf half; }; @@ -500,7 +500,7 @@ struct packet_traits<std::complex<double> > : default_packet_traits { template <> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; - enum { size = 1, alignment = Aligned16 }; + enum { size = 1, alignment = Aligned16, vectorizable=true }; typedef Packet1cd half; }; diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index 094c874ee..a97156a84 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -117,14 +117,14 @@ struct packet_traits<int32_t> : default_packet_traits { template <> struct unpacket_traits<Packet4f> { typedef float type; - enum { size = 4, alignment = Aligned16 }; + enum { size = 4, alignment = Aligned16, vectorizable=true }; typedef Packet4f half; }; template <> struct unpacket_traits<Packet4i> { typedef int32_t type; - enum { size = 4, alignment = Aligned16 }; + enum { size = 4, alignment = Aligned16, vectorizable=true }; typedef Packet4i half; }; @@ -925,7 +925,7 @@ struct packet_traits<double> : default_packet_traits { template <> struct unpacket_traits<Packet2d> { typedef double type; - enum { size = 2, alignment = Aligned16 }; + enum { size = 2, alignment = Aligned16, vectorizable=true }; typedef Packet2d half; }; diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 62d107645..d149275b5 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -62,7 +62,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) { @@ -328,7 +328,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 76e6b3966..e8b351849 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -145,13 +145,13 @@ template<> struct unpacket_traits<Packet4f> typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits<Packet4i> { typedef int32_t type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } @@ -657,7 +657,7 @@ template<> struct packet_traits<double> : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index e51966f0d..f39988eac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -50,7 +50,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; #endif -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -283,7 +283,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; #endif -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index ebc540e24..9c3750af0 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -166,17 +166,17 @@ template<> struct unpacket_traits<Packet4f> { typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits<Packet2d> { typedef double type; typedef Packet2d half; - enum {size=2, alignment=Aligned16}; + enum {size=2, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits<Packet4i> { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=false}; }; #ifndef EIGEN_VECTORIZE_AVX diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h index c1da40d14..294cb101a 100644 --- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -88,7 +88,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2) #define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\ template<> struct unpacket_traits<packet_type> {\ typedef unpacket_type type;\ - enum {size=lengths, alignment=Aligned16};\ + enum {size=lengths, alignment=Aligned16, vectorizable=true};\ typedef packet_type half;\ }; SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 95aba428f..167c3ee4c 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -91,8 +91,8 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel); diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 0b37f4992..c8e90f1a8 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -239,9 +239,9 @@ template<> struct packet_traits<double> : default_packet_traits }; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4i half; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; /* Forward declaration */ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel); diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 86767d0e9..030c7740a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,13 @@ namespace Eigen { namespace internal { -template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target> +enum PacketSizeType { + PacketFull = 0, + PacketHalf, + PacketQuarter +}; + +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=PacketFull> class gebp_traits; @@ -365,6 +371,43 @@ struct QuadPacket const Packet& get(const FixedInt<3>&) const { return B3; } }; +template <int N, typename T1, typename T2, typename T3> +struct packet_conditional { typedef T3 type; }; + +template <typename T1, typename T2, typename T3> +struct packet_conditional<PacketFull, T1, T2, T3> { typedef T1 type; }; + +template <typename T1, typename T2, typename T3> +struct packet_conditional<PacketHalf, T1, T2, T3> { typedef T2 type; }; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<Scalar>::type, \ + typename packet_traits<Scalar>::half, \ + typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<Scalar>::type, \ + typename packet_traits<Scalar>::half, \ + typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \ + ScalarPacket + /* Vectorization logic * real*real: unpack rhs to constant packets, ... * @@ -375,7 +418,7 @@ struct QuadPacket * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch> +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize> class gebp_traits { public: @@ -383,13 +426,17 @@ public: typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -413,9 +460,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; @@ -503,21 +547,25 @@ public: }; -template<typename RealScalar, bool _ConjLhs, int Arch> -class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch> +template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize> +class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize> { public: typedef std::complex<RealScalar> LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -532,10 +580,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; - typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; @@ -714,8 +758,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { // return res; // } -template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch> -class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs,Arch> +template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex<RealScalar> Scalar; @@ -723,15 +767,21 @@ public: typedef std::complex<RealScalar> RhsScalar; typedef std::complex<RealScalar> ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<RealScalar>::Vectorizable - && packet_traits<Scalar>::Vectorizable, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1, + Vectorizable = unpacket_traits<RealPacket>::vectorizable + && unpacket_traits<ScalarPacket>::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -741,8 +791,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<RealScalar>::type RealPacket; - typedef typename packet_traits<Scalar>::type ScalarPacket; typedef DoublePacket<RealPacket> DoublePacketType; typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing; @@ -876,8 +924,8 @@ protected: conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; }; -template<typename RealScalar, bool _ConjRhs, int Arch> -class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch> +template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex<RealScalar> Scalar; @@ -885,14 +933,25 @@ public: typedef Scalar RhsScalar; typedef Scalar ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<RealScalar>::Vectorizable - && packet_traits<Scalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -903,10 +962,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; - typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; @@ -998,9 +1053,9 @@ protected: #if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON -template<> -struct gebp_traits <float, float, false, false,Architecture::NEON> - : gebp_traits<float,float,false,false,Architecture::Generic> +template<int _PacketSize> +struct gebp_traits <float, float, false, false,Architecture::NEON,_PacketSize> + : gebp_traits<float,float,false,false,Architecture::Generic,_PacketSize> { typedef float RhsPacket; @@ -1121,7 +1176,10 @@ struct gebp_traits <double, double, false, false,Architecture::NEON> template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct gebp_kernel { - typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,PacketHalf> HalfTraits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,PacketQuarter> QuarterTraits; + typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; @@ -1131,19 +1189,34 @@ struct gebp_kernel typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15; - typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter = QuarterTraits::LhsProgress, RhsProgress = Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -1154,11 +1227,11 @@ struct gebp_kernel }; template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs, - int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs>::LhsProgress> +int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress> struct last_row_process_16_packets { - typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; - typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; typedef typename Traits::ResScalar ResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; @@ -1186,8 +1259,8 @@ struct last_row_process_16_packets template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> { - typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; - typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; typedef typename Traits::ResScalar ResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; @@ -1233,6 +1306,202 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, } }; +template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> +struct lhs_process_one_packet +{ + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i<peelEnd; i+=LhsProgress) + { + // loops on each largest micro vertical panel of rhs (depth * nr) + for(Index j2=0; j2<packet_cols4; j2+=nr) + { + // We select a LhsProgress x nr micro block of res + // which is entirely stored into 1 x nr registers. + + const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + traits.initAcc(C2); + traits.initAcc(C3); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4"); + RhsPacketx4 rhs_panel; + RhsPacket T0; + + internal::prefetch(blB+(48+0)); + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(1, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(3, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + internal::prefetch(blB+(48+16)); + peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(5, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(7, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + + blB += pk*4*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacketx4 rhs_panel; + RhsPacket T0; + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + blB += 4*RhsProgress; + blA += LhsProgress; + } + + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = r0.template loadPacket<ResPacket>(0); + R1 = r1.template loadPacket<ResPacket>(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket<ResPacket>(0); + R1 = r3.template loadPacket<ResPacket>(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0; + traits.initAcc(C0); + + LinearMapper r0 = res.getLinearMapper(i, j2); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0; + + for(Index k= 0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1"); + RhsPacket B_0; + +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + /* FIXME: why unaligned???? */ \ + traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ + } while(false); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0; + ResPacket alphav = pset1<ResPacket>(alpha); + R0 = r0.template loadPacket<ResPacket>(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> +struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> EIGEN_DONT_INLINE void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs> @@ -1249,10 +1518,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index prefetch_res_offset = 32/sizeof(ResScalar); + const int prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -1718,176 +1989,29 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga //---------- Process 1 * LhsProgress rows at once ---------- if(mr>=1*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) - for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress) - { - // loops on each largest micro vertical panel of rhs (depth * nr) - for(Index j2=0; j2<packet_cols4; j2+=nr) - { - // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely - // stored into 1 x nr registers. - - const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - traits.initAcc(C2); - traits.initAcc(C3); - - LinearMapper r0 = res.getLinearMapper(i, j2 + 0); - LinearMapper r1 = res.getLinearMapper(i, j2 + 1); - LinearMapper r2 = res.getLinearMapper(i, j2 + 2); - LinearMapper r3 = res.getLinearMapper(i, j2 + 3); - - r0.prefetch(prefetch_res_offset); - r1.prefetch(prefetch_res_offset); - r2.prefetch(prefetch_res_offset); - r3.prefetch(prefetch_res_offset); - - // performs "inner" products - const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - prefetch(&blB[0]); - LhsPacket A0; - - for(Index k=0; k<peeled_kc; k+=pk) - { - EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4"); - RhsPacketx4 rhs_panel; - RhsPacket T0; - -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ - traits.loadRhs(&blB[(0+4*K)*RhsProgress], rhs_panel); \ - traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ - traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ - traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ - traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ - } while(false) - - internal::prefetch(blB+(48+0)); - EIGEN_GEBGP_ONESTEP(0); - EIGEN_GEBGP_ONESTEP(1); - EIGEN_GEBGP_ONESTEP(2); - EIGEN_GEBGP_ONESTEP(3); - internal::prefetch(blB+(48+16)); - EIGEN_GEBGP_ONESTEP(4); - EIGEN_GEBGP_ONESTEP(5); - EIGEN_GEBGP_ONESTEP(6); - EIGEN_GEBGP_ONESTEP(7); - - blB += pk*4*RhsProgress; - blA += pk*1*LhsProgress; - - EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); - } - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - RhsPacketx4 rhs_panel; - RhsPacket T0; - EIGEN_GEBGP_ONESTEP(0); - blB += 4*RhsProgress; - blA += 1*LhsProgress; - } -#undef EIGEN_GEBGP_ONESTEP - - ResPacket R0, R1; - ResPacket alphav = pset1<ResPacket>(alpha); - - R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - R1 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0 * Traits::ResPacketSize, R0); - r1.storePacket(0 * Traits::ResPacketSize, R1); - - R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - R1 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0 * Traits::ResPacketSize, R0); - r3.storePacket(0 * Traits::ResPacketSize, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2<cols; j2++) - { - // One column at a time - const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0; - traits.initAcc(C0); - - LinearMapper r0 = res.getLinearMapper(i, j2); - - // performs "inner" products - const RhsScalar* blB = &blockB[j2*strideB+offsetB]; - LhsPacket A0; - - for(Index k=0; k<peeled_kc; k+=pk) - { - EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1"); - RhsPacket B_0; - -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0, fix<0>); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \ - } while(false); - - EIGEN_GEBGP_ONESTEP(0); - EIGEN_GEBGP_ONESTEP(1); - EIGEN_GEBGP_ONESTEP(2); - EIGEN_GEBGP_ONESTEP(3); - EIGEN_GEBGP_ONESTEP(4); - EIGEN_GEBGP_ONESTEP(5); - EIGEN_GEBGP_ONESTEP(6); - EIGEN_GEBGP_ONESTEP(7); - - blB += pk*RhsProgress; - blA += pk*1*Traits::LhsProgress; - - EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1"); - } - - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - RhsPacket B_0; - EIGEN_GEBGP_ONESTEP(0); - blB += RhsProgress; - blA += 1*Traits::LhsProgress; - } -#undef EIGEN_GEBGP_ONESTEP - ResPacket R0; - ResPacket alphav = pset1<ResPacket>(alpha); - R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - r0.storePacket(0 * Traits::ResPacketSize, R0); - } - } + lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc1<rows) + if(peeled_mc_quarter<rows) { // loop on each panel of the rhs for(Index j2=0; j2<packet_cols4; j2+=nr) { // loop on each row of the lhs (1*LhsProgress x depth) - for(Index i=peeled_mc1; i<rows; i+=1) + for(Index i=peeled_mc_quarter; i<rows; i+=1) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); @@ -2030,7 +2154,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index j2=packet_cols4; j2<cols; j2++) { // loop on each row of the lhs (1*LhsProgress x depth) - for(Index i=peeled_mc1; i<rows; i+=1) + for(Index i=peeled_mc_quarter; i<rows; i+=1) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); @@ -2077,7 +2201,13 @@ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pa EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - enum { PacketSize = unpacket_traits<Packet>::size }; + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits<Packet>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -2089,9 +2219,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 - : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; Index i=0; @@ -2150,20 +2283,60 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); } } - // Pack scalars + // Pack half packets + if(HasHalf && Pack1>=HalfPacketSize) + { + for(; i<peeled_mc_half; i+=HalfPacketSize) + { + if(PanelMode) count += (HalfPacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + HalfPacket A; + A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); + } + } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i<peeled_mc_quarter; i+=QuarterPacketSize) + { + if(PanelMode) count += (QuarterPacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + QuarterPacket A; + A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). if(Pack2<PacketSize && Pack2>1) { - for(; i<peeled_mc0; i+=Pack2) + for(; i<peeled_mc0; i+=last_lhs_progress) { - if(PanelMode) count += Pack2 * offset; + if(PanelMode) count += last_lhs_progress * offset; for(Index k=0; k<depth; k++) - for(Index w=0; w<Pack2; w++) + for(Index w=0; w<last_lhs_progress; w++) blockA[count++] = cj(lhs(i+w, k)); - if(PanelMode) count += Pack2 * (stride-offset-depth); + if(PanelMode) count += last_lhs_progress * (stride-offset-depth); } } + // Pack scalars for(; i<rows; i++) { if(PanelMode) count += offset; @@ -2184,7 +2357,13 @@ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pa EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - enum { PacketSize = unpacket_traits<Packet>::size }; + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits<Packet>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -2192,37 +2371,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; -// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; -// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; -// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - - int pack = Pack1; Index i = 0; + int pack = Pack1; + int psize = PacketSize; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/pack)*pack; + Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; for(; i<peeled_mc; i+=pack) { if(PanelMode) count += pack * offset; - const Index peeled_k = (depth/PacketSize)*PacketSize; Index k=0; - if(pack>=PacketSize) + if(pack>=psize && psize >= QuarterPacketSize) { - for(; k<peeled_k; k+=PacketSize) + const Index peeled_k = (depth/psize)*psize; + for(; k<peeled_k; k+=psize) { - for (Index m = 0; m < pack; m += PacketSize) + for (Index m = 0; m < pack; m += psize) { - PacketBlock<Packet> kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + if (psize == PacketSize) { + PacketBlock<Packet> kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock<HalfPacket> kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock<QuarterPacket> kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } } - count += PacketSize*pack; + count += psize*pack; } } + for(; k<depth; k++) { Index w=0; @@ -2245,9 +2438,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa if(PanelMode) count += pack * (stride-offset-depth); } - pack -= PacketSize; - if(pack<Pack2 && (pack+PacketSize)!=Pack2) - pack = Pack2; + pack -= psize; + Index left = rows - i; + if (pack <= 0) { + if (!gone_last && + (starting_pos == i || left >= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). + if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } } for(; i<rows; i++) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index c84c71609..673073601 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -45,14 +45,23 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { - enum { PacketSize = packet_traits<Scalar>::size }; + typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits<Scalar>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i<peeled_mc3; i+=3*PacketSize) @@ -66,8 +75,16 @@ struct symm_pack_lhs for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize) pack<1*PacketSize>(blockA, lhs, cols, i, count); + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize) + pack<HalfPacketSize>(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize) + pack<QuarterPacketSize>(blockA, lhs, cols, i, count); + // do the same with mr==1 - for(Index i=peeled_mc1; i<rows; i++) + for(Index i=peeled_mc_quarter; i<rows; i++) { for(Index k=0; k<i; k++) blockA[count++] = lhs(i, k); // normal diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 836ff4711..91c2e42e4 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -184,7 +184,8 @@ template<typename T> struct unpacket_traits enum { size = 1, - alignment = 1 + alignment = 1, + vectorizable = false }; }; |