diff options
author | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-04-20 20:16:28 +0000 |
---|---|---|
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-04-20 20:16:28 +0000 |
commit | 2f6ddaa25c605b3fdfb991ebd6c4e945c81f1067 (patch) | |
tree | a7e0a3067141f79156da0dae03f9a3c9055d40e2 /Eigen/src/Core/GenericPacketMath.h | |
parent | 00f6340153860ffb3e4776a3f42aa851b596a094 (diff) |
Add partial vectorization for matrices and tensors of bool. This speeds up boolean operations on Tensors by up to 25x.
Benchmark numbers for the logical and of two NxN tensors:
name old time/op new time/op delta
BM_booleanAnd_1T/3 [using 1 threads] 14.6ns ± 0% 14.4ns ± 0% -0.96%
BM_booleanAnd_1T/4 [using 1 threads] 20.5ns ±12% 9.0ns ± 0% -56.07%
BM_booleanAnd_1T/7 [using 1 threads] 41.7ns ± 0% 10.5ns ± 0% -74.87%
BM_booleanAnd_1T/8 [using 1 threads] 52.1ns ± 0% 10.1ns ± 0% -80.59%
BM_booleanAnd_1T/10 [using 1 threads] 76.3ns ± 0% 13.8ns ± 0% -81.87%
BM_booleanAnd_1T/15 [using 1 threads] 167ns ± 0% 16ns ± 0% -90.45%
BM_booleanAnd_1T/16 [using 1 threads] 188ns ± 0% 16ns ± 0% -91.57%
BM_booleanAnd_1T/31 [using 1 threads] 667ns ± 0% 34ns ± 0% -94.83%
BM_booleanAnd_1T/32 [using 1 threads] 710ns ± 0% 35ns ± 0% -95.01%
BM_booleanAnd_1T/64 [using 1 threads] 2.80µs ± 0% 0.11µs ± 0% -95.93%
BM_booleanAnd_1T/128 [using 1 threads] 11.2µs ± 0% 0.4µs ± 0% -96.11%
BM_booleanAnd_1T/256 [using 1 threads] 44.6µs ± 0% 2.5µs ± 0% -94.31%
BM_booleanAnd_1T/512 [using 1 threads] 178µs ± 0% 10µs ± 0% -94.35%
BM_booleanAnd_1T/1k [using 1 threads] 717µs ± 0% 78µs ± 1% -89.07%
BM_booleanAnd_1T/2k [using 1 threads] 2.87ms ± 0% 0.31ms ± 1% -89.08%
BM_booleanAnd_1T/4k [using 1 threads] 11.7ms ± 0% 1.9ms ± 4% -83.55%
BM_booleanAnd_1T/10k [using 1 threads] 70.3ms ± 0% 17.2ms ± 4% -75.48%
Diffstat (limited to 'Eigen/src/Core/GenericPacketMath.h')
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 98 |
1 files changed, 55 insertions, 43 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 637aac9f9..ec61ac697 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -224,36 +224,6 @@ pabs(const unsigned long long& a) { return a; } template<typename Packet> EIGEN_DEVICE_FUNC inline Packet parg(const Packet& a) { using numext::arg; return arg(a); } -/** \internal \returns the bitwise and of \a a and \a b */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pand(const Packet& a, const Packet& b) { return a & b; } - -/** \internal \returns the bitwise or of \a a and \a b */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -por(const Packet& a, const Packet& b) { return a | b; } - -/** \internal \returns the bitwise xor of \a a and \a b */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pxor(const Packet& a, const Packet& b) { return a ^ b; } - -/** \internal \returns the bitwise andnot of \a a and \a b */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (~b); } - -/** \internal \returns ones */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} - -template <typename RealScalar> -EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) { - RealScalar b; - b = ptrue(b); - return std::complex<RealScalar>(b, b); -} - -/** \internal \returns the bitwise not of \a a */ -template <typename Packet> EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { return pxor(ptrue(a), a);} /** \internal \returns \a a logically shifted by N bits to the right */ template<int N> EIGEN_DEVICE_FUNC inline int @@ -294,6 +264,35 @@ pldexp(const Packet &a, const Packet &exponent) { return ldexp(a, static_cast<int>(exponent)); } +// Notice: The following ops accept and operate on bitwise masks. 
+// The value of each field in a mask is Scalar(0) or ~Scalar(0). +// For boolean packets like Packet16b, this is different from the +// representation of true and false, which are 1 and 0. +// As an example +// ptrue<Packet16b>() = 0xffffffffffffffffffffffffffffffff +// while +// pset1<Packet16b>(true) = 0x01010101010101010101010101010101 + +/** \internal \returns the bitwise and of \a a and \a b */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pand(const Packet& a, const Packet& b) { return a & b; } + +/** \internal \returns the bitwise or of \a a and \a b */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +por(const Packet& a, const Packet& b) { return a | b; } + +/** \internal \returns the bitwise xor of \a a and \a b */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pxor(const Packet& a, const Packet& b) { return a ^ b; } + +/** \internal \returns the bitwise and of \a a and not \a b */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pandnot(const Packet& a, const Packet& b) { return a & (~b); } + +/** \internal \returns ones */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} + /** \internal \returns zeros */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pzero(const Packet& a) { return pxor(a,a); } @@ -308,21 +307,16 @@ template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) { return 0.; } -/** \internal \returns bits of \a or \b according to the input bit mask \a mask */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pselect(const Packet& mask, const Packet& a, const Packet& b) { - return por(pand(a,mask),pandnot(b,mask)); -} - -template<> EIGEN_DEVICE_FUNC inline float pselect<float>( - const float& mask, const float& a, const float&b) { - return numext::equal_strict(mask,0.f) ? 
b : a; +template <typename RealScalar> +EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) { + RealScalar b; + b = ptrue(b); + return std::complex<RealScalar>(b, b); } -template<> EIGEN_DEVICE_FUNC inline double pselect<double>( - const double& mask, const double& a, const double& b) { - return numext::equal_strict(mask,0.) ? b : a; -} +/** \internal \returns the bitwise not of \a a */ +template <typename Packet> EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(ptrue(a), a);} /** \internal \returns a <= b as a bit mask */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet @@ -340,6 +334,24 @@ pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } +/** \internal \returns \a a or \a b for each field in packet according to \a mask */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pselect(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); +} + +template<> EIGEN_DEVICE_FUNC inline float pselect<float>( + const float& cond, const float& a, const float&b) { + return numext::equal_strict(cond,0.f) ? b : a; +} + +template<> EIGEN_DEVICE_FUNC inline double pselect<double>( + const double& cond, const double& a, const double& b) { + return numext::equal_strict(cond,0.) ? b : a; +} + + + /** \internal \returns the min of \a a and \a b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); } |