aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/GenericPacketMath.h
diff options
context:
space:
mode:
author: Rasmus Munk Larsen <rmlarsen@google.com> 2020-04-20 20:16:28 +0000
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2020-04-20 20:16:28 +0000
commit: 2f6ddaa25c605b3fdfb991ebd6c4e945c81f1067 (patch)
tree: a7e0a3067141f79156da0dae03f9a3c9055d40e2 /Eigen/src/Core/GenericPacketMath.h
parent: 00f6340153860ffb3e4776a3f42aa851b596a094 (diff)
Add partial vectorization for matrices and tensors of bool. This speeds up boolean operations on Tensors by up to 25x.
Benchmark numbers for the logical AND of two NxN tensors:

name                                     old time/op    new time/op    delta
BM_booleanAnd_1T/3   [using 1 threads]   14.6ns ± 0%    14.4ns ± 0%    -0.96%
BM_booleanAnd_1T/4   [using 1 threads]   20.5ns ±12%     9.0ns ± 0%   -56.07%
BM_booleanAnd_1T/7   [using 1 threads]   41.7ns ± 0%    10.5ns ± 0%   -74.87%
BM_booleanAnd_1T/8   [using 1 threads]   52.1ns ± 0%    10.1ns ± 0%   -80.59%
BM_booleanAnd_1T/10  [using 1 threads]   76.3ns ± 0%    13.8ns ± 0%   -81.87%
BM_booleanAnd_1T/15  [using 1 threads]    167ns ± 0%      16ns ± 0%   -90.45%
BM_booleanAnd_1T/16  [using 1 threads]    188ns ± 0%      16ns ± 0%   -91.57%
BM_booleanAnd_1T/31  [using 1 threads]    667ns ± 0%      34ns ± 0%   -94.83%
BM_booleanAnd_1T/32  [using 1 threads]    710ns ± 0%      35ns ± 0%   -95.01%
BM_booleanAnd_1T/64  [using 1 threads]   2.80µs ± 0%    0.11µs ± 0%   -95.93%
BM_booleanAnd_1T/128 [using 1 threads]   11.2µs ± 0%     0.4µs ± 0%   -96.11%
BM_booleanAnd_1T/256 [using 1 threads]   44.6µs ± 0%     2.5µs ± 0%   -94.31%
BM_booleanAnd_1T/512 [using 1 threads]    178µs ± 0%      10µs ± 0%   -94.35%
BM_booleanAnd_1T/1k  [using 1 threads]    717µs ± 0%      78µs ± 1%   -89.07%
BM_booleanAnd_1T/2k  [using 1 threads]   2.87ms ± 0%    0.31ms ± 1%   -89.08%
BM_booleanAnd_1T/4k  [using 1 threads]   11.7ms ± 0%     1.9ms ± 4%   -83.55%
BM_booleanAnd_1T/10k [using 1 threads]   70.3ms ± 0%    17.2ms ± 4%   -75.48%
Diffstat (limited to 'Eigen/src/Core/GenericPacketMath.h')
-rw-r--r-- Eigen/src/Core/GenericPacketMath.h | 98
1 files changed, 55 insertions, 43 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 637aac9f9..ec61ac697 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -224,36 +224,6 @@ pabs(const unsigned long long& a) { return a; }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
parg(const Packet& a) { using numext::arg; return arg(a); }
-/** \internal \returns the bitwise and of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pand(const Packet& a, const Packet& b) { return a & b; }
-
-/** \internal \returns the bitwise or of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-por(const Packet& a, const Packet& b) { return a | b; }
-
-/** \internal \returns the bitwise xor of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pxor(const Packet& a, const Packet& b) { return a ^ b; }
-
-/** \internal \returns the bitwise andnot of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (~b); }
-
-/** \internal \returns ones */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
-
-template <typename RealScalar>
-EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
- RealScalar b;
- b = ptrue(b);
- return std::complex<RealScalar>(b, b);
-}
-
-/** \internal \returns the bitwise not of \a a */
-template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pnot(const Packet& a) { return pxor(ptrue(a), a);}
/** \internal \returns \a a logically shifted by N bits to the right */
template<int N> EIGEN_DEVICE_FUNC inline int
@@ -294,6 +264,35 @@ pldexp(const Packet &a, const Packet &exponent) {
return ldexp(a, static_cast<int>(exponent));
}
+// Notice: The following ops accept and operate on bitwise masks.
+// The value of each field in a mask is Scalar(0) or ~Scalar(0).
+// For boolean packet like Packet16b, this is different from the
+// representation of true and false, which are 1 and 0.
+// As an example
+// ptrue<Packet16b>() = 0xffffffffffffffffffffffffffffffff
+// while
+// pset1<Packet16b>(true) = 0x01010101010101010101010101010101
+
+/** \internal \returns the bitwise and of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pand(const Packet& a, const Packet& b) { return a & b; }
+
+/** \internal \returns the bitwise or of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+por(const Packet& a, const Packet& b) { return a | b; }
+
+/** \internal \returns the bitwise xor of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pxor(const Packet& a, const Packet& b) { return a ^ b; }
+
+/** \internal \returns the bitwise and of \a a and not \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pandnot(const Packet& a, const Packet& b) { return a & (~b); }
+
+/** \internal \returns ones */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
+
/** \internal \returns zeros */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pzero(const Packet& a) { return pxor(a,a); }
@@ -308,21 +307,16 @@ template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) {
return 0.;
}
-/** \internal \returns bits of \a or \b according to the input bit mask \a mask */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pselect(const Packet& mask, const Packet& a, const Packet& b) {
- return por(pand(a,mask),pandnot(b,mask));
-}
-
-template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
- const float& mask, const float& a, const float&b) {
- return numext::equal_strict(mask,0.f) ? b : a;
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
+ RealScalar b;
+ b = ptrue(b);
+ return std::complex<RealScalar>(b, b);
}
-template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
- const double& mask, const double& a, const double& b) {
- return numext::equal_strict(mask,0.) ? b : a;
-}
+/** \internal \returns the bitwise not of \a a */
+template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) { return pxor(ptrue(a), a);}
/** \internal \returns a <= b as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -340,6 +334,24 @@ pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); }
+/** \internal \returns \a a or \a b for each field in packet according to \a mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pselect(const Packet& mask, const Packet& a, const Packet& b) {
+ return por(pand(a,mask),pandnot(b,mask));
+}
+
+template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
+ const float& cond, const float& a, const float&b) {
+ return numext::equal_strict(cond,0.f) ? b : a;
+}
+
+template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
+ const double& cond, const double& a, const double& b) {
+ return numext::equal_strict(cond,0.) ? b : a;
+}
+
+
+
/** \internal \returns the min of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }