diff options
author | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-05-11 13:23:31 -0700 |
---|---|---|
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-05-14 22:39:13 +0000 |
commit | 9b411757abd8458f9689b1384c6bf75da9b82357 (patch) | |
tree | eb23c0c65eaa4df204880c94b287fcf1b43e14cd /Eigen/src/Core/GenericPacketMath.h | |
parent | d640276d31a7dea9207a68a061a6fa7c9fdf50e5 (diff) |
Add missing packet ops for bool, and make it pass the same packet op unit tests as other arithmetic types.
This change also contains a few minor cleanups:
1. Remove packet op pnot, which is not needed for anything other than pcmp_lt_or_nan,
which can be done in other ways.
2. Remove the "HasInsert" enum, which is no longer needed since we removed the
corresponding packet ops.
3. Add faster pselect op for Packet4i when SSE4.1 is supported.
Among other things, this makes the fast transposeInPlace() method available for Matrix<bool>.
Run on ************** (72 X 2994 MHz CPUs); 2020-05-09T10:51:02.372347913-07:00
CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB
Benchmark Time(ns) CPU(ns) Iterations
-----------------------------------------------------------------------
BM_TransposeInPlace<float>/4 9.77 9.77 71670320
BM_TransposeInPlace<float>/8 21.9 21.9 31929525
BM_TransposeInPlace<float>/16 66.6 66.6 10000000
BM_TransposeInPlace<float>/32 243 243 2879561
BM_TransposeInPlace<float>/59 844 844 829767
BM_TransposeInPlace<float>/64 933 933 750567
BM_TransposeInPlace<float>/128 3944 3945 177405
BM_TransposeInPlace<float>/256 16853 16853 41457
BM_TransposeInPlace<float>/512 204952 204968 3448
BM_TransposeInPlace<float>/1k 1053889 1053861 664
BM_TransposeInPlace<bool>/4 14.4 14.4 48637301
BM_TransposeInPlace<bool>/8 36.0 36.0 19370222
BM_TransposeInPlace<bool>/16 31.5 31.5 22178902
BM_TransposeInPlace<bool>/32 111 111 6272048
BM_TransposeInPlace<bool>/59 626 626 1000000
BM_TransposeInPlace<bool>/64 428 428 1632689
BM_TransposeInPlace<bool>/128 1677 1677 417377
BM_TransposeInPlace<bool>/256 7126 7126 96264
BM_TransposeInPlace<bool>/512 29021 29024 24165
BM_TransposeInPlace<bool>/1k 116321 116330 6068
Diffstat (limited to 'Eigen/src/Core/GenericPacketMath.h')
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 67 |
1 file changed, 29 insertions, 38 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 449793372..d25b45ab0 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -58,7 +58,6 @@ struct default_packet_traits HasConj = 1, HasSetLinear = 1, HasBlend = 0, - HasInsert = 0, HasDiv = 0, HasSqrt = 0, @@ -191,8 +190,10 @@ psub(const Packet& a, const Packet& b) { return a-b; } template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pnegate(const Packet& a) { return -a; } -/** \internal \returns conj(a) (coeff-wise) */ +template<> EIGEN_DEVICE_FUNC inline bool +pnegate(const bool& a) { return !a; } +/** \internal \returns conj(a) (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pconj(const Packet& a) { return numext::conj(a); } @@ -269,38 +270,9 @@ pldexp(const Packet &a, const Packet &exponent) { return ldexp(a, static_cast<int>(exponent)); } -// Notice: The following ops accept and operator on bitwise masks. -// The value of each field in a masks is Scalar(0) or ~Scalar(0). -// For boolean packet like Packet16b, this is different from the -// representation of true and false, which are 1 and 0. 
-// As an example -// ptrue<Packet16b>() = 0xffffffffffffffffffffffffffffffff -// while -// pset1<Packet16b>(true) = 0x01010101010101010101010101010101 - -/** \internal \returns the bitwise and of \a a and \a b */ +/** \internal \returns zero bits */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pand(const Packet& a, const Packet& b) { return a & b; } - -/** \internal \returns the bitwise or of \a a and \a b */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -por(const Packet& a, const Packet& b) { return a | b; } - -/** \internal \returns the bitwise xor of \a a and \a b */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pxor(const Packet& a, const Packet& b) { return a ^ b; } - -/** \internal \returns the bitwise and of \a a and not \a b */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (~b); } - -/** \internal \returns ones */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} - -/** \internal \returns zeros */ -template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& a) { return pxor(a,a); } +pzero(const Packet& /*a*/) { Packet b; memset((void*)&b, 0, sizeof(b)); return b;} template<> EIGEN_DEVICE_FUNC inline float pzero<float>(const float& a) { EIGEN_UNUSED_VARIABLE(a); @@ -312,6 +284,10 @@ template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) { return 0.; } +/** \internal \returns one bits */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} + template <typename RealScalar> EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) { RealScalar b; @@ -319,9 +295,21 @@ EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealS return std::complex<RealScalar>(b, b); } -/** 
\internal \returns the bitwise not of \a a */ -template <typename Packet> EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { return pxor(ptrue(a), a);} +/** \internal \returns the bitwise and of \a a and \a b */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pand(const Packet& a, const Packet& b) { return a & b; } + +/** \internal \returns the bitwise or of \a a and \a b */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +por(const Packet& a, const Packet& b) { return a | b; } + +/** \internal \returns the bitwise xor of \a a and \a b */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pxor(const Packet& a, const Packet& b) { return a ^ b; } + +/** \internal \returns the bitwise and of \a a and not \a b */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pandnot(const Packet& a, const Packet& b) { return pand(a, pxor(ptrue(b), b)); } /** \internal \returns a <= b as a bit mask */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet @@ -337,7 +325,7 @@ pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? pzero(a) : ptrue(a); } /** \internal \returns \a or \b for each field in packet according to \mask */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet @@ -355,7 +343,10 @@ template<> EIGEN_DEVICE_FUNC inline double pselect<double>( return numext::equal_strict(cond,0.) ? b : a; } - +template<> EIGEN_DEVICE_FUNC inline bool pselect<bool>( + const bool& cond, const bool& a, const bool& b) { + return cond ? a : b; +} /** \internal \returns the min of \a a and \a b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet |