diff options
21 files changed, 125 insertions, 43 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 04a321b9f..3bba02278 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -287,6 +287,14 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; } template<typename Packet> EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; } +/** \internal \returns a packet version of \a *from, (un-aligned masked load) + * There is no generic implementation. We only have implementations for specialized + * cases. Generic case should not be called. + */ +template<typename Packet> EIGEN_DEVICE_FUNC inline +typename enable_if<unpacket_traits<Packet>::masked_load_available, Packet>::type +ploadu(const typename unpacket_traits<Packet>::type* from, typename unpacket_traits<Packet>::mask_t umask); + /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits<Packet>::type& a) { return a; } diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 5b8ff59bd..3d229cd81 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -47,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; #endif -template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -263,7 +263,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; #endif -template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index f88e36024..3f94f85eb 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -117,14 +117,15 @@ template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; typedef Packet8i integer_packet; - enum {size=8, alignment=Aligned32, vectorizable=true}; + typedef uint8_t mask_t; + enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true}; }; template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; - enum {size=4, alignment=Aligned32, vectorizable=true}; + enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false}; }; -template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false}; }; +template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); } @@ -385,6 +386,14 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EI template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); } +template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) { + __m256i mask = _mm256_set1_epi8(static_cast<char>(umask)); + const __m256i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + mask = _mm256_or_si256(mask, bit_mask); + mask = _mm256_cmpeq_epi32(mask, _mm256_set1_epi32(0xffffffff)); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask); +} + // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 9a89dd01f..5ab2ffe77 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -51,7 +51,8 @@ template<> struct unpacket_traits<Packet8cf> { enum { size = 8, alignment=unpacket_traits<Packet16f>::alignment, - vectorizable=true + vectorizable=true, + masked_load_available=false }; typedef Packet4cf half; }; @@ -247,7 +248,8 @@ template<> struct unpacket_traits<Packet4cd> { enum { size = 4, alignment = unpacket_traits<Packet8d>::alignment, - vectorizable=true + vectorizable=true, + masked_load_available=false }; typedef Packet2cd half; }; diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 60b723b08..094309eda 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -102,19 +102,20 @@ struct unpacket_traits<Packet16f> { typedef float type; typedef Packet8f half; typedef Packet16i integer_packet; - enum { size = 16, alignment=Aligned64, vectorizable=true }; + typedef uint16_t mask_t; + enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true }; }; template <> struct unpacket_traits<Packet8d> { typedef double type; typedef Packet4d half; - enum { size = 8, alignment=Aligned64, vectorizable=true }; + enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false }; }; template <> struct unpacket_traits<Packet16i> { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64, vectorizable=false }; + enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false }; }; template <> @@ -485,6 +486,12 @@ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) { reinterpret_cast<const __m512i*>(from)); } +template <> +EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from, uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from); +} + // Loads 8 floats from memory a returns the packet // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} template <> diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 0c9f8e05d..ebc3b2aeb 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -60,7 +60,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) { @@ -291,7 +291,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 9535724eb..b5484e6aa 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -192,13 +192,13 @@ template<> struct unpacket_traits<Packet4f> typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> struct unpacket_traits<Packet4i> { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=false}; + enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false}; }; inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) @@ -921,7 +921,7 @@ template<> struct packet_traits<double> : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2d half; }; inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index cd4615a45..7fac0a5e1 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -92,8 +92,8 @@ template<> struct packet_traits<double> : default_packet_traits }; -template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; }; -template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef double2 half; }; +template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef float4 half; }; +template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef double2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) { return make_float4(from, from, from, from); diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index cd518c7e4..2bee56f0f 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -42,7 +42,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits }; }; -template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; }; +template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef half2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { return __half2half2(from); @@ -567,7 +567,7 @@ struct packet_traits<half> : default_packet_traits { }; -template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; }; +template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; typedef uint16_t mask_t; enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=true}; typedef Packet16h half; }; template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) { Packet16h result; @@ -591,6 +591,14 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* fr return result; } +template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from, + uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + Packet16h result; + result.x = _mm256_maskz_loadu_epi16(mask, from); + return result; +} + template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) { // (void*) -> workaround clang warning: // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 @@ -1056,7 +1064,7 @@ struct packet_traits<Eigen::half> : default_packet_traits { }; -template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; }; +template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet8h half; }; template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) { Packet8h result; @@ -1419,7 +1427,7 @@ struct packet_traits<Eigen::half> : default_packet_traits { }; -template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; }; +template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet4h half; }; template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) { Packet4h result; diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index fa64d3564..0ced061c5 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -127,7 +127,7 @@ struct packet_traits<std::complex<float> > : default_packet_traits { template <> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; - enum { size = 2, alignment = Aligned16, vectorizable=true }; + enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet2cf half; }; @@ -500,7 +500,7 @@ struct packet_traits<std::complex<double> > : default_packet_traits { template <> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; - enum { size = 1, alignment = Aligned16, vectorizable=true }; + enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet1cd half; }; diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index a97156a84..f426d5b83 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -117,14 +117,14 @@ struct packet_traits<int32_t> : default_packet_traits { template <> struct unpacket_traits<Packet4f> { typedef float type; - enum { size = 4, alignment = Aligned16, vectorizable=true }; + enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet4f half; }; template <> struct unpacket_traits<Packet4i> { typedef int32_t type; - enum { size = 4, alignment = Aligned16, vectorizable=true }; + enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet4i half; }; @@ -925,7 +925,7 @@ struct packet_traits<double> : default_packet_traits { template <> struct unpacket_traits<Packet2d> { typedef double type; - enum { size = 2, alignment = Aligned16, vectorizable=true }; + enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet2d half; }; diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index f6c5c211c..c17d0a03e 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -62,7 +62,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) { @@ -340,7 +340,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e8b351849..b8051cfbf 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -145,13 +145,13 @@ template<> struct unpacket_traits<Packet4f> typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> struct unpacket_traits<Packet4i> { typedef int32_t type; typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=true}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } @@ -657,7 +657,7 @@ template<> struct packet_traits<double> : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2d half; }; template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index f39988eac..7d89c323b 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -50,7 +50,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; #endif -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -283,7 +283,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; #endif -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 9c3750af0..04b63601b 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -166,17 +166,17 @@ template<> struct unpacket_traits<Packet4f> { typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> struct unpacket_traits<Packet2d> { typedef double type; typedef Packet2d half; - enum {size=2, alignment=Aligned16, vectorizable=true}; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> struct unpacket_traits<Packet4i> { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=false}; + enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false}; }; #ifndef EIGEN_VECTORIZE_AVX diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h index 294cb101a..1afa63bb2 100644 --- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -88,7 +88,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2) #define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\ template<> struct unpacket_traits<packet_type> {\ typedef unpacket_type type;\ - enum {size=lengths, alignment=Aligned16, vectorizable=true};\ + enum {size=lengths, alignment=Aligned16, vectorizable=true, masked_load_available=false};\ typedef packet_type half;\ }; SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 167c3ee4c..9fcbcb82d 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -91,8 +91,8 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel); diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index c8e90f1a8..74e0a1313 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -239,9 +239,9 @@ template<> struct packet_traits<double> : default_packet_traits }; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4i half; }; -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4f half; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet4i half; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2d half; }; /* Forward declaration */ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel); diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 91c2e42e4..ce019946b 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -185,7 +185,8 @@ template<typename T> struct unpacket_traits { size = 1, alignment = 1, - vectorizable = false + vectorizable = false, + masked_load_available=false }; }; diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 4906f6eb0..200670b8c 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -119,6 +119,9 @@ struct packet_helper inline Packet load(const T* from) const { return internal::pload<Packet>(from); } template<typename T> + inline Packet load(const T* from, unsigned long long umask) const { return internal::ploadu<Packet>(from, umask); } + + template<typename T> inline void store(T* to, const Packet& x) const { internal::pstore(to,x); } }; @@ -129,6 +132,9 @@ struct packet_helper<false,Packet> inline T load(const T* from) const { return *from; } template<typename T> + inline T load(const T* from, unsigned long long) const { return *from; } + + template<typename T> inline void store(T* to, const T& x) const { *to = x; } }; @@ -169,6 +175,7 @@ template<typename Scalar,typename Packet> void packetmath() const int size = PacketSize*max_size; EIGEN_ALIGN_MAX Scalar data1[size]; EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar data3[size]; EIGEN_ALIGN_MAX Packet packets[PacketSize*2]; EIGEN_ALIGN_MAX Scalar ref[size]; RealScalar refvalue = RealScalar(0); @@ -194,6 +201,22 @@ template<typename Scalar,typename Packet> void packetmath() VERIFY(areApprox(data1, data2+offset, PacketSize) && "internal::pstoreu"); } + if (internal::unpacket_traits<Packet>::masked_load_available) + { + unsigned long long max_umask = (0x1ull << PacketSize); + for (int offset=0; offset<PacketSize; ++offset) + { + for (unsigned long long umask=0; umask<max_umask; ++umask) + { + packet_helper<internal::unpacket_traits<Packet>::masked_load_available, Packet> h; + h.store(data2, h.load(data1+offset, umask)); + for (int k=0; k<PacketSize; ++k) + data3[k] = ((umask & ( 0x1ull << k )) >> k) ? data1[k+offset] : Scalar(0); + VERIFY(areApprox(data3, data2, PacketSize) && "internal::ploadu masked"); + } + } + } + for (int offset=0; offset<PacketSize; ++offset) { #define MIN(A,B) (A<B?A:B) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 4ca6b3d8c..05c684ff9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -95,6 +95,18 @@ struct TensorEvaluator return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); } + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template <typename PacketReturnTypeT = PacketReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const + { + return internal::ploadu<PacketReturnTypeT>(m_data + index, umask); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { @@ -244,6 +256,18 @@ struct TensorEvaluator<const Derived, Device> return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index); } + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template <typename PacketReturnTypeT = PacketReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const + { + return internal::ploadu<PacketReturnTypeT>(m_data + index, umask); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { eigen_assert(m_data); const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords) |