From 69ace742be6f00f4280d312e046b0b1422fd112c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Nov 2018 15:56:08 +0100 Subject: Several improvements regarding packet-bitwise operations: - add unit tests - optimize their AVX512f implementation - add missing implementations (half, Packet4f, ...) --- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 8787adcde..cdd2b001b 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -640,6 +640,19 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = por(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pxor(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pand(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pandnot(a.x,b.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { // FIXME we could do that with bit manipulation Packet16f af = half2float(a); @@ -1063,6 +1076,19 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = por(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pxor(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pand(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pandnot(a.x,b.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { -- cgit v1.2.3 From 1ac2695ef7e1fc8e147a37ad97391d7a2941c696 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 6 Dec 2018 00:05:10 +0100 Subject: bug #1636: fix compilation with some ABI versions. --- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index cdd2b001b..f3d721dd7 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -641,16 +641,18 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { } template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = por(a.x,b.x); return r; + // in some cases Packet8i is a wrapper around __m256i, so we need to + // cast to Packet8i to call the correct overload. + Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pxor(a.x,b.x); return r; + Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pand(a.x,b.x); return r; + Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { - Packet16h r; r.x = pandnot(a.x,b.x); return r; + Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; } template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { @@ -1077,16 +1079,18 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { } template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = por(a.x,b.x); return r; + // in some cases Packet4i is a wrapper around __m128i, so we either need to + // cast to Packet4i to directly call the intrinsics as below: + Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r; } template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = pxor(a.x,b.x); return r; + Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r; } template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = pand(a.x,b.x); return r; + Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r; } template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { - Packet8h r; r.x = pandnot(a.x,b.x); return r; + Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; } template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } -- cgit v1.2.3 From e763fcd09e620300226ca22d152b94867123b603 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Chaves Date: Wed, 19 Dec 2018 14:24:44 -0800 Subject: Introducing "vectorized" byte on unpacket_traits structs This is a preparation to a change on gebp_traits, where a new template argument will be introduced to dictate the packet size, so it won't be bound to the current/max packet size only anymore. By having packet types defined early on gebp_traits, one has now to act on packet types, not scalars anymore, for the enum values defined on that class. One approach for reaching the vectorizable/size properties one needs there could be getting the packet's scalar again with unpacket_traits<>, then the size/Vectorizable enum entries from packet_traits<>. It turns out guards like "#ifndef EIGEN_VECTORIZE_AVX512" at AVX/PacketMath.h will hide smaller packet variations of packet_traits<> for some types (and it makes sense to keep that). In other words, one can't go back to the scalar and create a new PacketType, as this will always lead to the maximum packet type for the architecture. The less costly/invasive solution for that, thus, is to add the vectorizable info on every unpacket_traits struct as well. --- Eigen/src/Core/arch/AVX/Complex.h | 4 ++-- Eigen/src/Core/arch/AVX/PacketMath.h | 6 +++--- Eigen/src/Core/arch/AVX512/Complex.h | 6 ++++-- Eigen/src/Core/arch/AVX512/PacketMath.h | 6 +++--- Eigen/src/Core/arch/AltiVec/Complex.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 6 +++--- Eigen/src/Core/arch/GPU/PacketMath.h | 4 ++-- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 8 ++++---- Eigen/src/Core/arch/MSA/Complex.h | 4 ++-- Eigen/src/Core/arch/MSA/PacketMath.h | 6 +++--- Eigen/src/Core/arch/NEON/Complex.h | 4 ++-- Eigen/src/Core/arch/NEON/PacketMath.h | 6 +++--- Eigen/src/Core/arch/SSE/Complex.h | 4 ++-- Eigen/src/Core/arch/SSE/PacketMath.h | 6 +++--- Eigen/src/Core/arch/SYCL/InteropHeaders.h | 2 +- Eigen/src/Core/arch/ZVector/Complex.h | 4 ++-- Eigen/src/Core/arch/ZVector/PacketMath.h | 6 +++--- Eigen/src/Core/util/XprHelper.h | 3 ++- 18 files changed, 46 insertions(+), 43 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 08d021b65..e7e2a1033 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -47,7 +47,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -255,7 +255,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e5aeb6375..e771c0f25 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -117,14 +117,14 @@ template<> struct unpacket_traits { typedef float type; typedef Packet4f half; typedef Packet8i integer_packet; - enum {size=8, alignment=Aligned32}; + enum {size=8, alignment=Aligned32, vectorizable=true}; }; template<> struct unpacket_traits { typedef double type; typedef Packet2d half; - enum {size=4, alignment=Aligned32}; + enum {size=4, alignment=Aligned32, vectorizable=true}; }; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 247f89860..569ee01ff 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -50,7 +50,8 @@ template<> struct unpacket_traits { typedef std::complex type; enum { size = 8, - alignment=unpacket_traits::alignment + alignment=unpacket_traits::alignment, + vectorizable=true }; typedef Packet4cf half; }; @@ -237,7 +238,8 @@ template<> struct unpacket_traits { typedef std::complex type; enum { size = 4, - alignment = unpacket_traits::alignment + alignment = unpacket_traits::alignment, + vectorizable=true }; typedef Packet2cd half; }; diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 10284dd7c..9c3121062 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -102,19 +102,19 @@ struct unpacket_traits { typedef float type; typedef Packet8f half; typedef Packet16i integer_packet; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment=Aligned64, vectorizable=true }; }; template <> struct unpacket_traits { typedef double type; typedef Packet4d half; - enum { size = 8, alignment=Aligned64 }; + enum { size = 8, alignment=Aligned64, vectorizable=true }; }; template <> struct unpacket_traits { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment=Aligned64, vectorizable=false }; }; template <> diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 3e665730c..5404a624e 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -60,7 +60,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -286,7 +286,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index d0ee93f4a..2c06003ed 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -192,13 +192,13 @@ template<> struct unpacket_traits typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=false}; }; inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) @@ -916,7 +916,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index ddf37b9c1..eaba60e26 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -90,8 +90,8 @@ template<> struct packet_traits : default_packet_traits }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef double2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { return make_float4(from, from, from, from); diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index f3d721dd7..cc5c484b6 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -41,7 +41,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { @@ -517,7 +517,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; }; template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { Packet16h result; @@ -984,7 +984,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; }; template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { Packet8h result; @@ -1329,7 +1329,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; }; template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { Packet4h result; diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index 9a45cf51e..fa64d3564 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -127,7 +127,7 @@ struct packet_traits > : default_packet_traits { template <> struct unpacket_traits { typedef std::complex type; - enum { size = 2, alignment = Aligned16 }; + enum { size = 2, alignment = Aligned16, vectorizable=true }; typedef Packet2cf half; }; @@ -500,7 +500,7 @@ struct packet_traits > : default_packet_traits { template <> struct unpacket_traits { typedef std::complex type; - enum { size = 1, alignment = Aligned16 }; + enum { size = 1, alignment = Aligned16, vectorizable=true }; typedef Packet1cd half; }; diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index 094c874ee..a97156a84 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -117,14 +117,14 @@ struct packet_traits : default_packet_traits { template <> struct unpacket_traits { typedef float type; - enum { size = 4, alignment = Aligned16 }; + enum { size = 4, alignment = Aligned16, vectorizable=true }; typedef Packet4f half; }; template <> struct unpacket_traits { typedef int32_t type; - enum { size = 4, alignment = Aligned16 }; + enum { size = 4, alignment = Aligned16, vectorizable=true }; typedef Packet4i half; }; @@ -925,7 +925,7 @@ struct packet_traits : default_packet_traits { template <> struct unpacket_traits { typedef double type; - enum { size = 2, alignment = Aligned16 }; + enum { size = 2, alignment = Aligned16, vectorizable=true }; typedef Packet2d half; }; diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 306a309be..5e6de1f40 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -62,7 +62,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -328,7 +328,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ed3cec88a..ca4f2bf94 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -145,13 +145,13 @@ template<> struct unpacket_traits typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits { typedef int32_t type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } @@ -650,7 +650,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 0f8960328..911fe066e 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -50,7 +50,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -280,7 +280,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3e7a75bc0..4c7dc5b64 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -166,17 +166,17 @@ template<> struct unpacket_traits { typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits { typedef double type; typedef Packet2d half; - enum {size=2, alignment=Aligned16}; + enum {size=2, alignment=Aligned16, vectorizable=true}; }; template<> struct unpacket_traits { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16}; + enum {size=4, alignment=Aligned16, vectorizable=false}; }; #ifndef EIGEN_VECTORIZE_AVX diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h index c1da40d14..294cb101a 100644 --- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -88,7 +88,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2) #define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\ template<> struct unpacket_traits {\ typedef unpacket_type type;\ - enum {size=lengths, alignment=Aligned16};\ + enum {size=lengths, alignment=Aligned16, vectorizable=true};\ typedef packet_type half;\ }; SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 95aba428f..167c3ee4c 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -91,8 +91,8 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 0b37f4992..c8e90f1a8 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -239,9 +239,9 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; /* Forward declaration */ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 836ff4711..91c2e42e4 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -184,7 +184,8 @@ template struct unpacket_traits enum { size = 1, - alignment = 1 + alignment = 1, + vectorizable = false }; }; -- cgit v1.2.3 From 055f0b73dbdd3b6e32ab10f8c3538b360124627f Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 7 Jan 2019 16:53:36 -0800 Subject: Add support for pcmp_eq and pnot, including for complex types. --- Eigen/src/Core/GenericPacketMath.h | 23 ++++++++++++++++++----- Eigen/src/Core/arch/AVX/Complex.h | 14 ++++++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 1 + Eigen/src/Core/arch/AVX512/Complex.h | 18 ++++++++++++++++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 13 +++++++++++++ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 14 ++++++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 14 ++++++++++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 3 ++- test/packetmath.cpp | 9 +++++++++ 9 files changed, 103 insertions(+), 6 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 2b2ee9e2c..883c35d2c 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,6 +214,18 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + Packet ones = pset1(Scalar(1)); + return pandnot(ones, a); +} + /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int pshiftright(const int& a) { return a >> N; } @@ -258,7 +270,12 @@ pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b); /* { return a==b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_eq(const Packet& a, const Packet& b) +{ + typedef typename unpacket_traits::type Scalar; + Packet zeros = pset1(Scalar(0)); + return a==b ? pnot(zeros) : zeros; +} /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet @@ -272,10 +289,6 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ -template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } - /** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 08d021b65..23687c624 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -69,6 +69,13 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con return Packet4cf(result); } +template <> +EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { + __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); + __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); + return Packet4cf(real_and_imag_equal); +} + template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -276,6 +283,13 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con return Packet2cd(_mm256_addsub_pd(even, odd)); } +template <> +EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { + __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); + __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5)); + return Packet2cd(real_and_imag_equal); +} + template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e5aeb6375..27c35fbd9 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -228,6 +228,7 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 42cdfcd25..2c613f870 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -80,6 +80,15 @@ template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); } +template <> +EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { + __m512 eq = pcmp_eq(a.v, b.v); + __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); + __m512i real_and_imag_equal = _mm512_and_si512( + _mm512_castps_si512(eq), _mm512_castps_si512(eq_swap_real_imag)); + return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); +} + template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } @@ -267,6 +276,15 @@ template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); } +template <> +EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { + __m512d eq = pcmp_eq(a.v, b.v); + __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); + __m512i real_and_imag_equal = _mm512_and_si512( + _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); + return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); +} + template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 72b09d998..710351ed0 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -295,6 +295,19 @@ template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packe return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); } +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); +} template <> EIGEN_STRONG_INLINE Packet16i pand(const Packet16i& a, diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index f3d721dd7..3e35f96cc 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -655,6 +655,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; } +template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pcmp_eq(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { // FIXME we could do that with bit manipulation Packet16f af = half2float(a); @@ -1093,6 +1100,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; } +template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pcmp_eq(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 0f8960328..a7304193b 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -439,6 +439,20 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + __m128 eq = _mm_cmpeq_ps(a.v, b.v); + __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); + return Packet2cf(real_and_imag_equal); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) +{ + __m128d eq = _mm_cmpeq_pd(a.v, b.v); + __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); + return Packet1cd(real_and_imag_equal); +} + template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); return Packet2cf(_mm_castpd_ps(result)); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3e7a75bc0..71cf6b3bb 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -374,9 +374,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 1158c4f9a..3b700fdd9 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -238,6 +238,7 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul); CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); + CHECK_CWISE1(internal::pnot, internal::pnot); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); @@ -398,6 +399,14 @@ template void packetmath() CHECK_CWISE2_IF(true, internal::pand, internal::pand); CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot); } + + { + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random(); + data2[i] = (i % 2) ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq); + } } template void packetmath_real() -- cgit v1.2.3 From cb955df9a6fd5cb2673a7a15172609ce2dafdde8 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:17:08 -0800 Subject: Add packet up "pones". Write pnot(a) as pxor(pones(a), a). --- Eigen/src/Core/GenericPacketMath.h | 47 ++++++++++++++------------------ Eigen/src/Core/arch/AVX/Complex.h | 4 +++ Eigen/src/Core/arch/AVX/PacketMath.h | 19 +++++++++++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 26 ++++++++++++------ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 +++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 5 ++++ Eigen/src/Core/arch/SSE/PacketMath.h | 11 ++++++++ test/packetmath.cpp | 2 ++ 8 files changed, 95 insertions(+), 35 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bdf16e16..777c74f57 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,17 +214,21 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +/** \internal \returns a packet with constant coefficients set from bits */ +template EIGEN_DEVICE_FUNC inline Packet +pset1frombits(BitsType a); + +/** \internal \returns zeros */ template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } +pzero(const Packet& a) { return pxor(a,a); } -/** \internal \returns the bitwise not of \a a */ +/** \internal \returns ones */ template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - Packet ones = pset1(Scalar(1)); - return pandnot(ones, a); -} +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(pones(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -250,36 +254,25 @@ pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); } template EIGEN_DEVICE_FUNC inline Packet pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent); } -/** \internal \returns zeros */ -template EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& a) { return pxor(a,a); } - /** \internal \returns bits of \a or \b according to the input bit mask \a mask */ template EIGEN_DEVICE_FUNC inline Packet -pselect(const Packet& mask, const Packet& a, const Packet& b) { - return por(pand(a,mask),pandnot(b,mask)); -} +pselect(const Packet& mask, const Packet& a, const Packet& b) { return por(pand(a,mask),pandnot(b,mask)); } /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) -{ - typedef typename unpacket_traits::type Scalar; - Packet zeros = pset1(Scalar(0)); - return a==b ? pnot(zeros) : zeros; -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -289,9 +282,9 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } -/** \internal \returns a packet with constant coefficients set from bits */ -template EIGEN_DEVICE_FUNC inline Packet -pset1frombits(BitsType a); +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23687c624..d880ef593 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -76,6 +76,8 @@ EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(real_and_imag_equal); } +template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -290,6 +292,8 @@ EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(real_and_imag_equal); } +template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a6af48f21..f6a514fbf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { + return _mm256_cmpeq_epi64(a,a); +} +#else +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm256_set_epi32(o, o, o, o, o, o, o, o); +} +#endif +template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { + return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { + return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 68adf5e57..d258fd07b 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -295,12 +295,6 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packe return cat256(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); -} - template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); @@ -317,14 +311,30 @@ template <> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pones(const Packet16i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { + return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { + return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 3e35f96cc..c4dfedcf8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { + half2 result; + *(reinterpret_cast(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { + Packet16h r; r.x = Packet8i(pones(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. @@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index a7304193b..8372cedfb 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b8a5497a9..6dd2f8a46 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +pones(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +pones(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a88b7bba9..460cfbdbe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -239,6 +239,8 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); CHECK_CWISE1(internal::pnot, internal::pnot); + CHECK_CWISE1(internal::pzero, internal::pzero); + CHECK_CWISE1(internal::pones, internal::pones); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); -- cgit v1.2.3 From 8f044425263e876236030f62461507325edfdf44 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:34:23 -0800 Subject: Collapsed revision * Collapsed revision * Add packet up "pones". Write pnot(a) as pxor(pones(a), a). * Collapsed revision * Simplify a bit. * Undo useless diffs. * Fix typo. --- Eigen/src/Core/GenericPacketMath.h | 29 ++++++++----------- Eigen/src/Core/arch/AVX/Complex.h | 10 ++++--- Eigen/src/Core/arch/AVX/PacketMath.h | 19 +++++++++++++ Eigen/src/Core/arch/AVX512/Complex.h | 10 ++----- Eigen/src/Core/arch/AVX512/PacketMath.h | 49 +++++++++++++++++++------------- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 +++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 11 ++++--- Eigen/src/Core/arch/SSE/PacketMath.h | 11 +++++++ test/packetmath.cpp | 2 ++ 9 files changed, 104 insertions(+), 53 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bdf16e16..8bcceaa7b 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,17 +214,13 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +/** \internal \returns ones */ template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} /** \internal \returns the bitwise not of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - Packet ones = pset1(Scalar(1)); - return pandnot(ones, a); -} +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(pones(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -262,24 +258,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) { /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) -{ - typedef typename unpacket_traits::type Scalar; - Packet zeros = pset1(Scalar(0)); - return a==b ? pnot(zeros) : zeros; -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -289,6 +280,10 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } + /** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23687c624..9f1bb969e 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -72,10 +72,11 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con template <> EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); - __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); - return Packet4cf(real_and_imag_equal); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } +template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -286,10 +287,11 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con template <> EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); - __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5)); - return Packet2cd(real_and_imag_equal); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } +template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a6af48f21..f6a514fbf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { + return _mm256_cmpeq_epi64(a,a); +} +#else +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm256_set_epi32(o, o, o, o, o, o, o, o); +} +#endif +template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { + return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { + return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 2c613f870..154fedc25 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -83,10 +83,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, template <> EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { __m512 eq = pcmp_eq(a.v, b.v); - __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castps_si512(eq), _mm512_castps_si512(eq_swap_real_imag)); - return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } @@ -279,10 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, template <> EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { __m512d eq = pcmp_eq(a.v, b.v); - __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); - return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 68adf5e57..9666c4e22 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -284,47 +284,56 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { #endif template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); -} - -template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); - __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); - return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); } template <> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pones(const Packet16i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { + return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { + return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 3e35f96cc..c4dfedcf8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { + half2 result; + *(reinterpret_cast(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { + Packet16h r; r.x = Packet8i(pones(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. @@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index a7304193b..fa84097ac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -442,15 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); - return Packet2cf(real_and_imag_equal); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); - return Packet1cd(real_and_imag_equal); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b8a5497a9..6dd2f8a46 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +pones(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +pones(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a88b7bba9..460cfbdbe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -239,6 +239,8 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); CHECK_CWISE1(internal::pnot, internal::pnot); + CHECK_CWISE1(internal::pzero, internal::pzero); + CHECK_CWISE1(internal::pones, internal::pones); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); -- cgit v1.2.3 From e15bb785adf756f3e48410ee681ca97ad5bb3e76 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:34:23 -0800 Subject: Collapsed revision * Add packet up "pones". Write pnot(a) as pxor(pones(a), a). * Collapsed revision * Simplify a bit. * Undo useless diffs. * Fix typo. --- Eigen/src/Core/GenericPacketMath.h | 29 ++++++++----------- Eigen/src/Core/arch/AVX/Complex.h | 10 ++++--- Eigen/src/Core/arch/AVX/PacketMath.h | 19 +++++++++++++ Eigen/src/Core/arch/AVX512/Complex.h | 10 ++----- Eigen/src/Core/arch/AVX512/PacketMath.h | 49 +++++++++++++++++++------------- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 +++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 11 ++++--- Eigen/src/Core/arch/SSE/PacketMath.h | 11 +++++++ test/packetmath.cpp | 2 ++ 9 files changed, 104 insertions(+), 53 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bdf16e16..8bcceaa7b 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,17 +214,13 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +/** \internal \returns ones */ template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} /** \internal \returns the bitwise not of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - Packet ones = pset1(Scalar(1)); - return pandnot(ones, a); -} +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(pones(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -262,24 +258,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) { /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) -{ - typedef typename unpacket_traits::type Scalar; - Packet zeros = pset1(Scalar(0)); - return a==b ? pnot(zeros) : zeros; -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -289,6 +280,10 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } + /** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23687c624..9f1bb969e 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -72,10 +72,11 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con template <> EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); - __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); - return Packet4cf(real_and_imag_equal); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } +template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -286,10 +287,11 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con template <> EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); - __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5)); - return Packet2cd(real_and_imag_equal); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } +template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a6af48f21..f6a514fbf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { + return _mm256_cmpeq_epi64(a,a); +} +#else +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm256_set_epi32(o, o, o, o, o, o, o, o); +} +#endif +template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { + return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { + return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 2c613f870..154fedc25 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -83,10 +83,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, template <> EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { __m512 eq = pcmp_eq(a.v, b.v); - __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castps_si512(eq), _mm512_castps_si512(eq_swap_real_imag)); - return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } @@ -279,10 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, template <> EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { __m512d eq = pcmp_eq(a.v, b.v); - __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); - return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 68adf5e57..9666c4e22 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -284,47 +284,56 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { #endif template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); -} - -template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); - __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); - return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); } template <> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pones(const Packet16i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { + return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { + return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 3e35f96cc..c4dfedcf8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { + half2 result; + *(reinterpret_cast(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { + Packet16h r; r.x = Packet8i(pones(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. @@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index a7304193b..fa84097ac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -442,15 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); - return Packet2cf(real_and_imag_equal); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); - return Packet1cd(real_and_imag_equal); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b8a5497a9..6dd2f8a46 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +pones(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +pones(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a88b7bba9..460cfbdbe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -239,6 +239,8 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); CHECK_CWISE1(internal::pnot, internal::pnot); + CHECK_CWISE1(internal::pzero, internal::pzero); + CHECK_CWISE1(internal::pones, internal::pones); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); -- cgit v1.2.3 From df29511ac0486639e23fe65c7edafecc2d9f1579 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 11 Jan 2019 10:36:36 -0800 Subject: Fix merge. --- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 85a32a18d..00e40d40b 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,11 +143,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } -<<<<<<< working copy template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { -======= -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { ->>>>>>> merge rev half2 result; *(reinterpret_cast(&(result))) = 0xffffffffu; } @@ -652,13 +648,8 @@ template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; } -<<<<<<< working copy template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { Packet16h r; r.x = Packet8i(ptrue(a.x)); return r; -======= -template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { - Packet16h r; r.x = Packet8i(pones(a.x)); return r; ->>>>>>> merge rev } template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { @@ -1106,11 +1097,7 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } -<<<<<<< working copy template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { -======= -template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { ->>>>>>> merge rev Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; } -- cgit v1.2.3 From 89c4001d6f5136fd2702258e4fa754be31d682a1 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 11 Jan 2019 14:10:57 -0800 Subject: Fix warnings in ptrue for complex and half types. --- Eigen/src/Core/GenericPacketMath.h | 7 +++++++ Eigen/src/Core/arch/GPU/PacketMathHalf.h | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 95c4e4027..bb3275fe8 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -218,6 +218,13 @@ pandnot(const Packet& a, const Packet& b) { return a & (~b); } template EIGEN_DEVICE_FUNC inline Packet ptrue(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} +template +EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { + RealScalar b; + b = ptrue(b); + return std::complex(b, b); +} + /** \internal \returns the bitwise not of \a a */ template EIGEN_DEVICE_FUNC inline Packet pnot(const Packet& a) { return pxor(ptrue(a), a);} diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 00e40d40b..eab7be14c 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,10 +143,14 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half ptrue(const half& a) { + return __half_raw(0xffffu); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { half2 result; *(reinterpret_cast(&(result))) = 0xffffffffu; -} +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { -- cgit v1.2.3 From 7401e2541deffd08c61b0426b2bcd21ffd481ac0 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 16 Jan 2019 14:43:33 -0800 Subject: Fix compilation error for logical packet ops with older compilers. --- Eigen/src/Core/arch/AVX/Complex.h | 8 ++++---- Eigen/src/Core/arch/AVX512/Complex.h | 8 ++++---- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 4 ---- Eigen/src/Core/arch/SSE/Complex.h | 12 ++++++------ 4 files changed, 14 insertions(+), 18 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index dcca35279..16faf1082 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -75,8 +75,8 @@ EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } -template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -290,8 +290,8 @@ EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } -template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 7bb2fd630..7ea72c509 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -55,8 +55,8 @@ template<> struct unpacket_traits { typedef Packet4cf half; }; -template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pnot(const Packet8cf& a) { return Packet8cf(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet8cf pnot(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); } template<> EIGEN_STRONG_INLINE Packet8cf padd(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf psub(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) @@ -270,8 +270,8 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, con return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); } -template<> EIGEN_STRONG_INLINE Packet4cd ptrue(const Packet4cd& a) { return Packet4cd(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pnot(const Packet4cd& a) { return Packet4cd(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd ptrue(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cd pnot(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index eab7be14c..020baa353 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,10 +143,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half ptrue(const half& a) { - return __half_raw(0xffffu); -} - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { half2 result; *(reinterpret_cast(&(result))) = 0xffffffffu; diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index c3b1de5ce..e51966f0d 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,8 +82,8 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } @@ -308,8 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(a.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -447,13 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { -- cgit v1.2.3 From 2eccbaf3f73f34a2bac3420377ea844358dfaf5a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 17 Jan 2019 17:45:08 -0800 Subject: Add missing logical packet ops for GPU and NEON. --- Eigen/src/Core/arch/GPU/PacketMath.h | 111 ++++++++++++++++++++++++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 2 + 2 files changed, 113 insertions(+) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index ddf37b9c1..e3b2d56ec 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -100,6 +100,117 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const do return make_double2(from, from); } +namespace { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) | __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) | + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) ^ __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) ^ + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & ~__float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + ~__double_as_longlong(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, + const float& b) { + return __int_as_float(a == b ? 0xffffffffu : 0u); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, + const double& b) { + return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull); +} + +} // namespace + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, + const float4& b) { + return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), + bitwise_and(a.z, b.z), bitwise_and(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand(const double2& a, + const double2& b) { + return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por(const float4& a, + const float4& b) { + return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), + bitwise_or(a.z, b.z), bitwise_or(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por(const double2& a, + const double2& b) { + return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor(const float4& a, + const float4& b) { + return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), + bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor(const double2& a, + const double2& b) { + return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot(const float4& a, + const float4& b) { + return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), + bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pandnot(const double2& a, const double2& b) { + return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq(const float4& a, + const float4& b) { + return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), + eq_mask(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pcmp_eq(const double2& a, const double2& b) { + return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); +} template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { return make_float4(a, a+1, a+2, a+3); diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index a8a7b63c9..76e6b3966 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -711,6 +711,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, con return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vceqq_f64(a,b)); } + template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } -- cgit v1.2.3 From 4d7f31710299fd869def962f2070c252ae1aaa67 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 21 Feb 2019 13:32:13 -0800 Subject: Add a few missing packet ops: cmp_eq for NEON. pfloor for GPU. --- Eigen/src/Core/arch/GPU/PacketMath.h | 9 +++++++++ Eigen/src/Core/arch/NEON/Complex.h | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index c1b097fb9..cd4615a45 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -53,6 +53,7 @@ template<> struct packet_traits : default_packet_traits HasBetaInc = 1, HasBlend = 0, + HasFloor = 1, }; }; @@ -86,6 +87,7 @@ template<> struct packet_traits : default_packet_traits HasBetaInc = 1, HasBlend = 0, + HasFloor = 1, }; }; @@ -408,6 +410,13 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { return make_double2(fabs(a.x), fabs(a.y)); } +template<> EIGEN_DEVICE_FUNC inline float4 pfloor(const float4& a) { + return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pfloor(const double2& a) { + return make_double2(floor(a.x), floor(a.y)); +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { float tmp = kernel.packet[0].y; diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index d149275b5..e9da4a3f6 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -101,6 +101,18 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con return Packet2cf(vaddq_f32(v1, v2)); } +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])] + Packet4f eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])] + Packet4f eq_swapped = vrev64q_f32(eq); + // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet2cf(pand(eq, eq_swapped)); +} + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); @@ -361,6 +373,18 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(vaddq_f64(v1, v2)); } +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) +{ + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a)==re(b), im(a)==im(b)] + Packet2d eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a)==im(b), re(a)==re(b)] + Packet2d eq_swapped = vrev64q_u32(eq); + // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cd(pand(eq, eq_swapped)); +} + template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -- cgit v1.2.3 From 77f7d4a894a633c2b8b72221a7b5f81e8d140182 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 11 Mar 2019 17:51:16 -0700 Subject: Clean up PacketMathHalf.h and add a few missing logical packet ops. --- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 80 +++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 6 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 316ac0283..5917ec1b8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -137,15 +137,21 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half2 result; - unsigned temp = *(reinterpret_cast(&(a))); - *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; - return result; + half a1 = __low2half(a); + half a2 = __high2half(a); + half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); + half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); + return __halves2half2(result1, result2); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { - half2 result; - *(reinterpret_cast(&(result))) = 0xffffffffu; + half true_half = half_impl::raw_uint16_to_half(0xffffu); + return pset1(true_half); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) { + half false_half = half_impl::raw_uint16_to_half(0x0000u); + return pset1(false_half); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -175,6 +181,68 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen: #endif } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; + return __halves2half2(eq1, eq2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); + return __halves2half2(result1, result2); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, + const half2& b) { + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); + return __halves2half2(result1, result2); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) -- cgit v1.2.3 From 8450a6d519454f318f490c797e089c2f0fc540f2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Mar 2019 15:18:06 -0700 Subject: Clean up half packet traits and add a few more missing packet ops. --- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 73 +++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 6 deletions(-) (limited to 'Eigen/src/Core/arch/GPU') diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 5917ec1b8..869fa7ec6 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -30,6 +30,7 @@ template<> struct packet_traits : default_packet_traits size=2, HasHalfPacket = 0, HasAdd = 1, + HasSub = 1, HasMul = 1, HasDiv = 1, HasSqrt = 1, @@ -572,6 +573,7 @@ struct packet_traits : default_packet_traits { HasAdd = 1, HasSub = 1, HasMul = 1, + HasDiv = 1, HasNegate = 1, HasAbs = 0, HasAbs2 = 0, @@ -579,7 +581,6 @@ struct packet_traits : default_packet_traits { HasMax = 0, HasConj = 0, HasSetLinear = 0, - HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, @@ -770,6 +771,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, con return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pdiv(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { Packet16f from_float = half2float(from); return half(predux(from_float)); @@ -1054,6 +1062,7 @@ struct packet_traits : default_packet_traits { HasAdd = 1, HasSub = 1, HasMul = 1, + HasDiv = 1, HasNegate = 1, HasAbs = 0, HasAbs2 = 0, @@ -1061,7 +1070,6 @@ struct packet_traits : default_packet_traits { HasMax = 0, HasConj = 0, HasSetLinear = 0, - HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, @@ -1221,6 +1229,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pdiv(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) { Packet8h result; @@ -1407,9 +1422,10 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 4, HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, HasNegate = 0, HasAbs = 0, HasAbs2 = 0, @@ -1417,7 +1433,6 @@ struct packet_traits : default_packet_traits { HasMax = 0, HasConj = 0, HasSetLinear = 0, - HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, @@ -1464,6 +1479,29 @@ template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const return result; } +template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha - hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { __int64_t a64 = _mm_cvtm64_si64(a.x); __int64_t b64 = _mm_cvtm64_si64(b.x); @@ -1487,6 +1525,29 @@ template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const return result; } +template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha / hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { Packet4h result; result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); -- cgit v1.2.3