aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2018-11-30 15:56:08 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2018-11-30 15:56:08 +0100
commit69ace742be6f00f4280d312e046b0b1422fd112c (patch)
tree83d296bd2070b1617791dcf0ee8c3bcd6c5ade2a
parentfa87f9d876f38e470e5070a451f92a3c19c9d0fe (diff)
Several improvements regarding packet-bitwise operations:
- add unit tests - optimize their AVX512f implementation - add missing implementations (half, Packet4f, ...)
-rw-r--r--Eigen/src/Core/GenericPacketMath.h20
-rw-r--r--Eigen/src/Core/arch/AVX/PacketMath.h28
-rw-r--r--Eigen/src/Core/arch/AVX512/PacketMath.h150
-rw-r--r--Eigen/src/Core/arch/GPU/PacketMathHalf.h26
-rw-r--r--test/packetmath.cpp42
5 files changed, 140 insertions, 126 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index cc044de22..9c2a437bf 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -158,13 +158,11 @@ preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); }
/** \internal \returns a + b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-padd(const Packet& a,
- const Packet& b) { return a+b; }
+padd(const Packet& a, const Packet& b) { return a+b; }
/** \internal \returns a - b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-psub(const Packet& a,
- const Packet& b) { return a-b; }
+psub(const Packet& a, const Packet& b) { return a-b; }
/** \internal \returns -a (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -177,23 +175,19 @@ pconj(const Packet& a) { return numext::conj(a); }
/** \internal \returns a * b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmul(const Packet& a,
- const Packet& b) { return a*b; }
+pmul(const Packet& a, const Packet& b) { return a*b; }
/** \internal \returns a / b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pdiv(const Packet& a,
- const Packet& b) { return a/b; }
+pdiv(const Packet& a, const Packet& b) { return a/b; }
/** \internal \returns the min of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmin(const Packet& a,
- const Packet& b) { return numext::mini(a, b); }
+pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); }
/** \internal \returns the max of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmax(const Packet& a,
- const Packet& b) { return numext::maxi(a, b); }
+pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); }
/** \internal \returns the absolute value of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -217,7 +211,7 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
/** \internal \returns the bitwise andnot of \a a and \a b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+pandnot(const Packet& a, const Packet& b) { return a & (~b); }
/** \internal \returns \a a shifted by N bits to the right */
template<int N> EIGEN_DEVICE_FUNC inline int
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 284a32676..b49bae0de 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -241,15 +241,43 @@ template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { re
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_and_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_or_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_xor_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_andnot_si256(b,a);
+#else
+ return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
{ return _mm256_blendv_ps(b,a,mask); }
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 7d90ce4c1..1d38fb758 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -268,30 +268,20 @@ template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packe
return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_and_si512(a,b);
+}
+
template <>
EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_and_ps(a, b);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
@@ -312,30 +302,18 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
return res;
#endif
}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_or_si512(a, b);
+}
+
template <>
-EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_ps(a, b);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
@@ -345,106 +323,52 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_pd(a, b);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_ps(a, b);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
+
template <>
-EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_pd(a, b);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_andnot_si512(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_ps(b, a);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, pandnot(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, pandnot(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, pandnot(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, pandnot(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
- return _mm512_andnot_pd(a, b);
+ return _mm512_andnot_pd(b, a);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 8787adcde..cdd2b001b 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -640,6 +640,19 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = por(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pxor(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pand(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pandnot(a.x,b.x); return r;
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
// FIXME we could do that with bit manipulation
Packet16f af = half2float(a);
@@ -1063,6 +1076,19 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = por(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = pxor(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = pand(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = pandnot(a.x,b.x); return r;
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 43c33ba94..144083f1b 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -27,7 +27,44 @@ bool g_first_pass = true;
namespace Eigen {
namespace internal {
+
template<typename T> T negate(const T& x) { return -x; }
+
+template<typename T>
+Map<const Array<unsigned char,sizeof(T),1> >
+bits(const T& x) {
+ return Map<const Array<unsigned char,sizeof(T),1> >(reinterpret_cast<const unsigned char *>(&x));
+}
+
+// The following implement bitwise operations on floating point types
+template<typename T,typename Bits,typename Func>
+T apply_bit_op(Bits a, Bits b, Func f) {
+ Array<unsigned char,sizeof(T),1> res;
+ for(Index i=0; i<res.size();++i) res[i] = f(a[i],b[i]);
+ return *reinterpret_cast<T*>(&res);
+}
+
+#define EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,T) \
+ template<> T EIGEN_CAT(p,OP)(const T& a,const T& b) { \
+ return apply_bit_op<T>(bits(a),bits(b),FUNC); \
+ }
+
+#define EIGEN_TEST_MAKE_BITWISE(OP,FUNC) \
+ EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,float) \
+ EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,double) \
+ EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,half) \
+ EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,std::complex<float>) \
+ EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,std::complex<double>)
+
+EIGEN_TEST_MAKE_BITWISE(xor,std::bit_xor<unsigned char>())
+EIGEN_TEST_MAKE_BITWISE(and,std::bit_and<unsigned char>())
+EIGEN_TEST_MAKE_BITWISE(or, std::bit_or<unsigned char>())
+struct bit_andnot{
+ template<typename T> T
+ operator()(T a, T b) const { return a & (~b); }
+};
+EIGEN_TEST_MAKE_BITWISE(andnot, bit_andnot())
+
}
}
@@ -304,6 +341,11 @@ template<typename Scalar,typename Packet> void packetmath()
}
}
+ CHECK_CWISE2_IF(true, internal::por, internal::por);
+ CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
+ CHECK_CWISE2_IF(true, internal::pand, internal::pand);
+ CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
+
if (PacketTraits::HasBlend) {
Packet thenPacket = internal::pload<Packet>(data1);
Packet elsePacket = internal::pload<Packet>(data2);