Extend support for Packet16b:

* Add ptranspose<*,4> to support matmul and add unit test for Matrix<bool> * Matrix<bool> * work around a bug in slicing of Tensor<bool>. * Add tensor tests This speeds up matmul for boolean matrices by about 10x name old time/op new time/op delta BM_MatMul<bool>/8 267ns ± 0% 479ns ± 0% +79.25% (p=0.008 n=5+5) BM_MatMul<bool>/32 6.42µs ± 0% 0.87µs ± 0% -86.50% (p=0.008 n=5+5) BM_MatMul<bool>/64 43.3µs ± 0% 5.9µs ± 0% -86.42% (p=0.008 n=5+5) BM_MatMul<bool>/128 315µs ± 0% 44µs ± 0% -85.98% (p=0.008 n=5+5) BM_MatMul<bool>/256 2.41ms ± 0% 0.34ms ± 0% -85.68% (p=0.008 n=5+5) BM_MatMul<bool>/512 18.8ms ± 0% 2.7ms ± 0% -85.53% (p=0.008 n=5+5) BM_MatMul<bool>/1k 149ms ± 0% 22ms ± 0% -85.40% (p=0.008 n=5+5)
author: Rasmus Munk Larsen <rmlarsen@google.com> 2020-04-24 17:29:25 -0700
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2020-04-28 16:12:47 +0000
commit: ab773c7e914633ec4a3ee1f7cdea8b168d3bce1a (patch)
tree: 5a82a3071e3c5de39e934ce144023c20f62b2de4 /Eigen/src/Core/arch/SSE/PacketMath.h
parent: b47c7779937c1984b7cd2f1d2f8df33d67c396f7 (diff)
1 files changed, 35 insertions, 2 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index ac0799467..16215ec72 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -170,10 +170,10 @@ template<> struct packet_traits<bool> : default_packet_traits
     HasHalfPacket = 0,
     size=16,
 
-    HasAdd       = 0,
+    HasAdd       = 1,
     HasSub       = 0,
     HasShift     = 0,
-    HasMul       = 0,
+    HasMul       = 1,
     HasNegate    = 0,
     HasAbs       = 0,
     HasAbs2      = 0,
@@ -249,6 +249,8 @@ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
@@ -290,6 +292,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
 
@@ -646,6 +650,7 @@ template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
+template<> EIGEN_STRONG_INLINE bool   pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
 #endif
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
@@ -762,6 +767,7 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
   Packet4i tmp0 = _mm_hadd_epi32(a,a);
   return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
 }
+
 #else
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
@@ -769,8 +775,22 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
   return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }
 #endif
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+  Packet16b tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+  return _mm_extract_epi64(tmp, 0) != 0;
+}
+#else
+template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
+}
+#endif
+
 // Other reduction functions:
 
+
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
@@ -987,6 +1007,19 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
 }
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,4>& kernel) {
+  __m128i T0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
+  __m128i T3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
+  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
+  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
+  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
+}
+
+
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
author	Rasmus Munk Larsen <rmlarsen@google.com>	2020-04-24 17:29:25 -0700
committer	Rasmus Munk Larsen <rmlarsen@google.com>	2020-04-28 16:12:47 +0000
commit	ab773c7e914633ec4a3ee1f7cdea8b168d3bce1a (patch)
tree	5a82a3071e3c5de39e934ce144023c20f62b2de4 /Eigen/src/Core/arch/SSE/PacketMath.h
parent	b47c7779937c1984b7cd2f1d2f8df33d67c396f7 (diff)