aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--Eigen/src/Core/GenericPacketMath.h47
-rw-r--r--Eigen/src/Core/arch/AVX/Complex.h4
-rw-r--r--Eigen/src/Core/arch/AVX/PacketMath.h19
-rw-r--r--Eigen/src/Core/arch/AVX512/PacketMath.h26
-rw-r--r--Eigen/src/Core/arch/GPU/PacketMathHalf.h16
-rw-r--r--Eigen/src/Core/arch/SSE/Complex.h5
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h11
-rw-r--r--test/packetmath.cpp2
8 files changed, 95 insertions, 35 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 8bdf16e16..777c74f57 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -214,17 +214,21 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pandnot(const Packet& a, const Packet& b) { return a & (~b); }
-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+/** \internal \returns a packet with constant coefficients set from bits */
+template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
+pset1frombits(BitsType a);
+
+/** \internal \returns zeros */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+pzero(const Packet& a) { return pxor(a,a); }
-/** \internal \returns the bitwise not of \a a */
+/** \internal \returns ones */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pnot(const Packet& a) {
- typedef typename unpacket_traits<Packet>::type Scalar;
- Packet ones = pset1<Packet>(Scalar(1));
- return pandnot(ones, a);
-}
+pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;}
+
+/** \internal \returns the bitwise not of \a a */
+template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) { return pxor(pones(a), a);}
/** \internal \returns \a a shifted by N bits to the right */
template<int N> EIGEN_DEVICE_FUNC inline int
@@ -250,36 +254,25 @@ pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent); }
-/** \internal \returns zeros */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pzero(const Packet& a) { return pxor(a,a); }
-
/** \internal \returns bits of \a or \b according to the input bit mask \a mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pselect(const Packet& mask, const Packet& a, const Packet& b) {
- return por(pand(a,mask),pandnot(b,mask));
-}
+pselect(const Packet& mask, const Packet& a, const Packet& b) { return por(pand(a,mask),pandnot(b,mask)); }
/** \internal \returns a <= b as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); }
/** \internal \returns a < b as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt(const Packet& a, const Packet& b); /* { return a<b ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_lt(const Packet& a, const Packet& b) { return a<b ? pones(a) : pzero(a); }
/** \internal \returns a == b as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_eq(const Packet& a, const Packet& b)
-{
- typedef typename unpacket_traits<Packet>::type Scalar;
- Packet zeros = pset1<Packet>(Scalar(0));
- return a==b ? pnot(zeros) : zeros;
-}
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); }
/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); }
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -289,9 +282,9 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
-/** \internal \returns a packet with constant coefficients set from bits */
-template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
-pset1frombits(BitsType a);
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 23687c624..d880ef593 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -76,6 +76,8 @@ EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
return Packet4cf(real_and_imag_equal);
}
+template<> EIGEN_STRONG_INLINE Packet4cf pones<Packet4cf>(const Packet4cf& a) { return Packet4cf(pones(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(a.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
@@ -290,6 +292,8 @@ EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
return Packet2cd(real_and_imag_equal);
}
+template<> EIGEN_STRONG_INLINE Packet2cd pones<Packet2cd>(const Packet2cd& a) { return Packet2cd(pones(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index a6af48f21..f6a514fbf 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template<> EIGEN_STRONG_INLINE Packet8i pones<Packet8i>(const Packet8i& a) {
+ return _mm256_cmpeq_epi64(a,a);
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet8i pones<Packet8i>(const Packet8i& /*a*/) {
+ const unsigned int o = 0xffffffffu;
+ return _mm256_set_epi32(o, o, o, o, o, o, o, o);
+}
+#endif
+template<> EIGEN_STRONG_INLINE Packet8f pones<Packet8f>(const Packet8f& a) {
+ return _mm256_castsi256_ps(pones<Packet8i>(_mm256_castps_si256(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d pones<Packet4d>(const Packet4d& a) {
+ return _mm256_castsi256_pd(pones<Packet8i>(_mm256_castpd_si256(a)));
+}
+
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 68adf5e57..d258fd07b 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -295,12 +295,6 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packe
return cat256(lo, hi);
}
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
- __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b));
- __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b));
- return cat256(lo, hi);
-}
-
template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
__m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b));
__m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b));
@@ -317,14 +311,30 @@ template <>
EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
return _mm512_castsi512_ps(
- _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff));
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
}
template <>
EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
return _mm512_castsi512_pd(
- _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff));
+ _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pones<Packet16i>(const Packet16i& /*a*/) {
+ const unsigned int o = 0xffffffffu;
+ return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pones<Packet16f>(const Packet16f& a) {
+ return _mm512_castsi512_ps(pones<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pones<Packet8d>(const Packet8d& a) {
+ return _mm512_castsi512_pd(pones<Packet16i>(_mm512_castpd_si512(a)));
}
template <>
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 3e35f96cc..c4dfedcf8 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2&
return result;
}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones<half2>(const half2& a) {
+ half2 result;
+ *(reinterpret_cast<unsigned*>(&(result))) = 0xffffffffu;
+}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
@@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
+ Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) {
+ Packet16h r; r.x = Packet8i(pones(a.x)); return r;
+}
+
template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
// in some cases Packet8i is a wrapper around __m256i, so we need to
// cast to Packet8i to call the correct overload.
@@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) {
+ Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r;
+}
+
template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
// in some cases Packet4i is a wrapper around __m128i, so we either need to
// cast to Packet4i to directly call the intrinsics as below:
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index a7304193b..8372cedfb 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
#endif
}
+template<> EIGEN_STRONG_INLINE Packet2cf pones <Packet2cf>(const Packet2cf& a) { return Packet2cf(pones(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnot <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(a.v)); }
+
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
@@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
#endif
}
+template<> EIGEN_STRONG_INLINE Packet1cd pones <Packet1cd>(const Packet1cd& a) { return Packet1cd(pones(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnot <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(a.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b8a5497a9..6dd2f8a46 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4
template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pones<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+pones<Packet4f>(const Packet4f& a) {
+ Packet4i b = _mm_castps_si128(a);
+ return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+pones<Packet2d>(const Packet2d& a) {
+ Packet4i b = _mm_castpd_si128(a);
+ return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index a88b7bba9..460cfbdbe 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -239,6 +239,8 @@ template<typename Scalar,typename Packet> void packetmath()
CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
CHECK_CWISE1(internal::pnot, internal::pnot);
+ CHECK_CWISE1(internal::pzero, internal::pzero);
+ CHECK_CWISE1(internal::pones, internal::pones);
CHECK_CWISE1(internal::negate, internal::pnegate);
CHECK_CWISE1(numext::conj, internal::pconj);