aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/SSE
diff options
context:
space:
mode:
authorGravatar Rasmus Munk Larsen <rmlarsen@google.com>2020-04-20 20:16:28 +0000
committerGravatar Rasmus Munk Larsen <rmlarsen@google.com>2020-04-20 20:16:28 +0000
commit2f6ddaa25c605b3fdfb991ebd6c4e945c81f1067 (patch)
treea7e0a3067141f79156da0dae03f9a3c9055d40e2 /Eigen/src/Core/arch/SSE
parent00f6340153860ffb3e4776a3f42aa851b596a094 (diff)
Add partial vectorization for matrices and tensors of bool. This speeds up boolean operations on Tensors by up to 25x.
Benchmark numbers for the logical and of two NxN tensors: name old time/op new time/op delta BM_booleanAnd_1T/3 [using 1 threads] 14.6ns ± 0% 14.4ns ± 0% -0.96% BM_booleanAnd_1T/4 [using 1 threads] 20.5ns ±12% 9.0ns ± 0% -56.07% BM_booleanAnd_1T/7 [using 1 threads] 41.7ns ± 0% 10.5ns ± 0% -74.87% BM_booleanAnd_1T/8 [using 1 threads] 52.1ns ± 0% 10.1ns ± 0% -80.59% BM_booleanAnd_1T/10 [using 1 threads] 76.3ns ± 0% 13.8ns ± 0% -81.87% BM_booleanAnd_1T/15 [using 1 threads] 167ns ± 0% 16ns ± 0% -90.45% BM_booleanAnd_1T/16 [using 1 threads] 188ns ± 0% 16ns ± 0% -91.57% BM_booleanAnd_1T/31 [using 1 threads] 667ns ± 0% 34ns ± 0% -94.83% BM_booleanAnd_1T/32 [using 1 threads] 710ns ± 0% 35ns ± 0% -95.01% BM_booleanAnd_1T/64 [using 1 threads] 2.80µs ± 0% 0.11µs ± 0% -95.93% BM_booleanAnd_1T/128 [using 1 threads] 11.2µs ± 0% 0.4µs ± 0% -96.11% BM_booleanAnd_1T/256 [using 1 threads] 44.6µs ± 0% 2.5µs ± 0% -94.31% BM_booleanAnd_1T/512 [using 1 threads] 178µs ± 0% 10µs ± 0% -94.35% BM_booleanAnd_1T/1k [using 1 threads] 717µs ± 0% 78µs ± 1% -89.07% BM_booleanAnd_1T/2k [using 1 threads] 2.87ms ± 0% 0.31ms ± 1% -89.08% BM_booleanAnd_1T/4k [using 1 threads] 11.7ms ± 0% 1.9ms ± 4% -83.55% BM_booleanAnd_1T/10k [using 1 threads] 70.3ms ± 0% 17.2ms ± 4% -75.48%
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h47
1 files changed, 47 insertions, 0 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 6a4dd5253..a56ae9dde 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -44,10 +44,12 @@ typedef __m128d Packet2d;
#endif
typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
+typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
@@ -158,6 +160,30 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
+template<> struct packet_traits<bool> : default_packet_traits
+{
+ typedef Packet16b type;
+ typedef Packet16b half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ HasHalfPacket = 0,
+ size=16,
+
+ HasAdd = 0,
+ HasSub = 0,
+ HasShift = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasConj = 0,
+ HasReduxp = 0
+ };
+};
+
template<> struct unpacket_traits<Packet4f> {
typedef float type;
typedef Packet4f half;
@@ -174,6 +200,11 @@ template<> struct unpacket_traits<Packet4i> {
typedef Packet4i half;
enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
};
+template<> struct unpacket_traits<Packet16b> {
+ typedef bool type;
+ typedef Packet16b half;
+ enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+};
#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
@@ -192,6 +223,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif
+template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) { return _mm_set1_epi8(static_cast<char>(from)); }
template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
@@ -385,8 +417,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2
template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
+
template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi32(a, a); }
template<> EIGEN_STRONG_INLINE Packet4f
ptrue<Packet4f>(const Packet4f& a) {
Packet4i b = _mm_castps_si128(a);
@@ -398,17 +433,21 @@ ptrue<Packet2d>(const Packet2d& a) {
return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
}
+
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
@@ -471,6 +510,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
#if EIGEN_COMP_MSVC
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
@@ -505,6 +545,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
+template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
@@ -522,10 +567,12 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{