Add missing packet ops for bool, and make it pass the same packet op unit tests as other arithmetic types.

This change also contains a few minor cleanups: 1. Remove packet op pnot, which is not needed for anything other than pcmp_le_or_nan, which can be done in other ways. 2. Remove the "HasInsert" enum, which is no longer needed since we removed the corresponding packet ops. 3. Add faster pselect op for Packet4i when SSE4.1 is supported. Among other things, this makes the fast transposeInPlace() method available for Matrix<bool>. Run on ************** (72 X 2994 MHz CPUs); 2020-05-09T10:51:02.372347913-07:00 CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB Benchmark Time(ns) CPU(ns) Iterations ----------------------------------------------------------------------- BM_TransposeInPlace<float>/4 9.77 9.77 71670320 BM_TransposeInPlace<float>/8 21.9 21.9 31929525 BM_TransposeInPlace<float>/16 66.6 66.6 10000000 BM_TransposeInPlace<float>/32 243 243 2879561 BM_TransposeInPlace<float>/59 844 844 829767 BM_TransposeInPlace<float>/64 933 933 750567 BM_TransposeInPlace<float>/128 3944 3945 177405 BM_TransposeInPlace<float>/256 16853 16853 41457 BM_TransposeInPlace<float>/512 204952 204968 3448 BM_TransposeInPlace<float>/1k 1053889 1053861 664 BM_TransposeInPlace<bool>/4 14.4 14.4 48637301 BM_TransposeInPlace<bool>/8 36.0 36.0 19370222 BM_TransposeInPlace<bool>/16 31.5 31.5 22178902 BM_TransposeInPlace<bool>/32 111 111 6272048 BM_TransposeInPlace<bool>/59 626 626 1000000 BM_TransposeInPlace<bool>/64 428 428 1632689 BM_TransposeInPlace<bool>/128 1677 1677 417377 BM_TransposeInPlace<bool>/256 7126 7126 96264 BM_TransposeInPlace<bool>/512 29021 29024 24165 BM_TransposeInPlace<bool>/1k 116321 116330 6068
author: Rasmus Munk Larsen <rmlarsen@google.com> 2020-05-11 13:23:31 -0700
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2020-05-14 22:39:13 +0000
commit: 9b411757abd8458f9689b1384c6bf75da9b82357 (patch)
tree: eb23c0c65eaa4df204880c94b287fcf1b43e14cd
parent: d640276d31a7dea9207a68a061a6fa7c9fdf50e5 (diff)
9 files changed, 243 insertions, 152 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 449793372..d25b45ab0 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -58,7 +58,6 @@ struct default_packet_traits
     HasConj      = 1,
     HasSetLinear = 1,
     HasBlend     = 0,
-    HasInsert    = 0,
 
     HasDiv    = 0,
     HasSqrt   = 0,
@@ -191,8 +190,10 @@ psub(const Packet& a, const Packet& b) { return a-b; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pnegate(const Packet& a) { return -a; }
 
-/** \internal \returns conj(a) (coeff-wise) */
+template<> EIGEN_DEVICE_FUNC inline bool
+pnegate(const bool& a) { return !a; }
 
+/** \internal \returns conj(a) (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pconj(const Packet& a) { return numext::conj(a); }
 
@@ -269,38 +270,9 @@ pldexp(const Packet &a, const Packet &exponent) {
   return ldexp(a, static_cast<int>(exponent));
 }
 
-// Notice: The following ops accept and operator on bitwise masks.
-// The value of each field in a masks is Scalar(0) or ~Scalar(0).
-// For boolean packet like Packet16b, this is different from the
-// representation of true and false, which are 1 and 0.
-// As an example
-//    ptrue<Packet16b>()     = 0xffffffffffffffffffffffffffffffff
-// while
-//    pset1<Packet16b>(true) = 0x01010101010101010101010101010101
-
-/** \internal \returns the bitwise and of \a a and \a b */
+/** \internal \returns zero bits */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pand(const Packet& a, const Packet& b) { return a & b; }
-
-/** \internal \returns the bitwise or of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-por(const Packet& a, const Packet& b) { return a | b; }
-
-/** \internal \returns the bitwise xor of \a a and \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pxor(const Packet& a, const Packet& b) { return a ^ b; }
-
-/** \internal \returns the bitwise and of \a a and not \a b */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (~b); }
-
-/** \internal \returns ones */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
-
-/** \internal \returns zeros */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pzero(const Packet& a) { return pxor(a,a); }
+pzero(const Packet& /*a*/) { Packet b; memset((void*)&b, 0, sizeof(b)); return b;}
 
 template<> EIGEN_DEVICE_FUNC inline float pzero<float>(const float& a) {
   EIGEN_UNUSED_VARIABLE(a);
@@ -312,6 +284,10 @@ template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) {
   return 0.;
 }
 
+/** \internal \returns one bits */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
+
 template <typename RealScalar>
 EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
   RealScalar b;
@@ -319,9 +295,21 @@ EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealS
   return std::complex<RealScalar>(b, b);
 }
 
-/** \internal \returns the bitwise not of \a a */
-template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pnot(const Packet& a) { return pxor(ptrue(a), a);}
+/** \internal \returns the bitwise and of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pand(const Packet& a, const Packet& b) { return a & b; }
+
+/** \internal \returns the bitwise or of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+por(const Packet& a, const Packet& b) { return a | b; }
+
+/** \internal \returns the bitwise xor of \a a and \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pxor(const Packet& a, const Packet& b) { return a ^ b; }
+
+/** \internal \returns the bitwise and of \a a and not \a b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pandnot(const Packet& a, const Packet& b) { return pand(a, pxor(ptrue(b), b)); }
 
 /** \internal \returns a <= b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -337,7 +325,7 @@ pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
 
 /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } 
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? pzero(a) : ptrue(a); } 
 
 /** \internal \returns \a or \b for each field in packet according to \mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -355,7 +343,10 @@ template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
   return numext::equal_strict(cond,0.) ? b : a;
 }
 
-
+template<> EIGEN_DEVICE_FUNC inline bool pselect<bool>(
+    const bool& cond, const bool& a, const bool& b) {
+  return cond ? a : b;
+}
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index c2d5205f2..23568cae9 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -42,8 +42,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
-    HasSetLinear = 0,
-    HasInsert = 1
+    HasSetLinear = 0
   };
 };
 #endif
@@ -77,7 +76,6 @@ EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
@@ -267,7 +265,6 @@ EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 35a329e3f..b27a62cbc 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -62,7 +62,6 @@ template<> struct packet_traits<float>  : default_packet_traits
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 1,
-    HasInsert = 1,
 
     HasDiv = 1,
     HasSin = EIGEN_FAST_MATH,
@@ -93,7 +92,6 @@ template<> struct packet_traits<double> : default_packet_traits
     AlignedOnScalar = 1,
     size=4,
     HasHalfPacket = 1,
-    HasInsert = 1,
 
     HasDiv  = 1,
     HasExp  = 1,
@@ -131,8 +129,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
     HasRsqrt = 0,
     HasExp = 0,
     HasLog = 0,
-    HasBlend = 0,
-    HasInsert = 1
+    HasBlend = 0
   };
 };
 #endif
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index dc2ae0a35..747f7a5d5 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -41,8 +41,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
-    HasSetLinear = 0,
-    HasInsert = 1
+    HasSetLinear = 0
   };
 };
 
@@ -59,7 +58,6 @@ template<> struct unpacket_traits<Packet8cf> {
 };
 
 template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
@@ -266,7 +264,6 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, con
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cd pand   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cd por    <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cd pxor   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 10a1d4adb..ad37ad620 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -73,8 +73,7 @@ struct packet_traits<half> : default_packet_traits {
     HasRsqrt = 0,
     HasExp = 0,
     HasLog = 0,
-    HasBlend = 0,
-    HasInsert = 1
+    HasBlend = 0
   };
 };
 
@@ -88,7 +87,6 @@ template<> struct packet_traits<float>  : default_packet_traits
     size = 16,
     HasHalfPacket = 1,
     HasBlend = 0,
-    HasInsert = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
@@ -117,7 +115,6 @@ template<> struct packet_traits<double> : default_packet_traits
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 1,
-    HasInsert = 1,
 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
     HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
@@ -1323,11 +1320,6 @@ template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
   return ptrue(Packet8i(a));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
-  return _mm256_xor_si256(a, ptrue(a));
-}
-
-
 template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
   // in some cases Packet8i is a wrapper around __m256i, so we need to
   // cast to Packet8i to call the correct overload.
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index ee5a938b9..e11af1dca 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -136,7 +136,6 @@ struct packet_traits<float> : default_packet_traits
     HasConj      = 1,
     HasSetLinear = 0,
     HasBlend     = 0,
-    HasInsert    = 1,
 
     HasDiv   = 1,
     HasFloor = 1,
@@ -177,8 +176,7 @@ struct packet_traits<int8_t> : default_packet_traits
     HasMax       = 1,
     HasConj      = 1,
     HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
+    HasBlend     = 0
   };
 };
 
@@ -209,7 +207,6 @@ struct packet_traits<uint8_t> : default_packet_traits
     HasConj      = 1,
     HasSetLinear = 0,
     HasBlend     = 0,
-    HasInsert    = 1,
 
     HasSqrt = 1
   };
@@ -241,8 +238,7 @@ struct packet_traits<int16_t> : default_packet_traits
     HasMax       = 1,
     HasConj      = 1,
     HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
+    HasBlend     = 0
   };
 };
 
@@ -273,8 +269,6 @@ struct packet_traits<uint16_t> : default_packet_traits
     HasConj      = 1,
     HasSetLinear = 0,
     HasBlend     = 0,
-    HasInsert    = 1,
-
     HasSqrt = 1
   };
 };
@@ -305,8 +299,7 @@ struct packet_traits<int32_t> : default_packet_traits
     HasMax       = 1,
     HasConj      = 1,
     HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
+    HasBlend     = 0
   };
 };
 
@@ -337,7 +330,6 @@ struct packet_traits<uint32_t> : default_packet_traits
     HasConj      = 1,
     HasSetLinear = 0,
     HasBlend     = 0,
-    HasInsert    = 1,
 
     HasSqrt = 1
   };
@@ -370,8 +362,7 @@ struct packet_traits<int64_t> : default_packet_traits
     HasMax       = 1,
     HasConj      = 1,
     HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
+    HasBlend     = 0
   };
 };
 
@@ -402,8 +393,7 @@ struct packet_traits<uint64_t> : default_packet_traits
     HasMax       = 1,
     HasConj      = 1,
     HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
+    HasBlend     = 0
   };
 };
 
@@ -1498,42 +1488,6 @@ template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, con
 template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
 { return vbicq_u64(a,b); }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pnot<Packet2f>(const Packet2f& a)
-{ return vreinterpret_f32_u32(vmvn_u32(vreinterpret_u32_f32(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pnot<Packet4f>(const Packet4f& a)
-{ return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pnot<Packet4c>(const Packet4c& a)
-{ return ~a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pnot<Packet8c>(const Packet8c& a)
-{ return vmvn_s8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pnot<Packet16c>(const Packet16c& a)
-{ return vmvnq_s8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pnot<Packet4uc>(const Packet4uc& a)
-{ return ~a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pnot<Packet8uc>(const Packet8uc& a)
-{ return vmvn_u8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pnot<Packet16uc>(const Packet16uc& a)
-{ return vmvnq_u8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pnot<Packet4s>(const Packet4s& a)
-{ return vmvn_s16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pnot<Packet8s>(const Packet8s& a)
-{ return vmvnq_s16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pnot<Packet4us>(const Packet4us& a)
-{ return vmvn_u16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pnot<Packet8us>(const Packet8us& a)
-{ return vmvnq_u16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pnot<Packet2i>(const Packet2i& a)
-{ return vmvn_s32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pnot<Packet4i>(const Packet4i& a)
-{ return vmvnq_s32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pnot<Packet2ui>(const Packet2ui& a)
-{ return vmvn_u32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pnot<Packet4ui>(const Packet4ui& a)
-{ return vmvnq_u32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pnot<Packet2l>(const Packet2l& a)
-{ return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pnot<Packet2ul>(const Packet2ul& a)
-{ return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a))); }
 
 template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
 { return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
@@ -3218,7 +3172,6 @@ template<> struct packet_traits<double>  : default_packet_traits
     HasConj      = 1,
     HasSetLinear = 0,
     HasBlend     = 0,
-    HasInsert    = 1,
 
     HasDiv   = 1,
     HasFloor = 1,
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 8bf8bfe85..0d322a2a1 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -45,8 +45,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasMin    = 0,
     HasMax    = 0,
     HasSetLinear = 0,
-    HasBlend  = 1,
-    HasInsert = 1
+    HasBlend  = 1
   };
 };
 #endif
@@ -84,7 +83,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf ptrue  <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnot   <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
@@ -292,7 +290,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
 }
 
 template<> EIGEN_STRONG_INLINE Packet1cd ptrue  <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnot   <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 645aee0cd..9f2e922a2 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -108,7 +108,6 @@ struct packet_traits<float> : default_packet_traits {
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasBlend = 1,
-    HasInsert = 1,
     HasFloor = 1
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -133,8 +132,7 @@ struct packet_traits<double> : default_packet_traits {
     HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasBlend = 1,
-    HasInsert = 1
+    HasBlend = 1
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
     ,
@@ -171,10 +169,10 @@ template<> struct packet_traits<bool> : default_packet_traits
     size=16,
 
     HasAdd       = 1,
-    HasSub       = 0,
+    HasSub       = 1,
     HasShift     = 0,
     HasMul       = 1,
-    HasNegate    = 0,
+    HasNegate    = 1,
     HasAbs       = 0,
     HasAbs2      = 0,
     HasMin       = 0,
@@ -254,6 +252,7 @@ template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, con
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
 {
@@ -270,6 +269,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
   return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
+{
+  return psub(pset1<Packet16b>(false), a);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
@@ -305,11 +309,29 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d&
 #endif
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {  return _mm_blendv_ps(b,a,mask); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return _mm_blendv_ps(b,a,mask);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+}
 
 template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {  return _mm_blendv_pd(b,a,mask); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  return _mm_blendv_epi8(b,a,mask);
+}
+#else
+template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  Packet16b a_part = _mm_and_si128(mask, a);
+  Packet16b b_part = _mm_andnot_si128(mask, b);
+  return _mm_or_si128(a_part, b_part);
+}
 #endif
 
+
+
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
 #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
   // There appears to be a bug in GCC, by which the optimizer may
@@ -567,6 +589,23 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
 }
 
+// Loads 8 bools from memory and returns the packet
+// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
+template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool*     from)
+{
+  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
+  return  _mm_unpacklo_epi8(tmp, tmp);
+}
+
+// Loads 4 bools from memory and returns the packet
+// {b0, b0  b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
+template<> EIGEN_STRONG_INLINE Packet16b
+ploadquad<Packet16b>(const bool* from) {
+  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
+  tmp = _mm_unpacklo_epi8(tmp, tmp);
+  return  _mm_unpacklo_epi16(tmp, tmp);
+}
+
 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
@@ -588,6 +627,14 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const dou
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
+{
+  return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
+                      from[11*stride], from[10*stride], from[9*stride], from[8*stride],
+                      from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+                      from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
  }
 
 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
@@ -609,6 +656,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
   to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
   to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
+template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
+{
+  to[4*stride*0] = _mm_cvtsi128_si32(from);
+  to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+
 
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
@@ -653,12 +708,19 @@ template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { retu
 template<> EIGEN_STRONG_INLINE bool   pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{ return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{ return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
+#ifdef EIGEN_VECTORIZE_SSSE3
+  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm_shuffle_epi8(a, mask);
+#else
+  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
+  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
+#endif
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
@@ -777,7 +839,7 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 #endif
 
 template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
-Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
   return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
 }
 
@@ -804,6 +866,12 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
   return  (aux[0] * aux[1]) * (aux[2] * aux[3]);
 }
 
+template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
+  return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
+          (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
+}
+
 // min
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
@@ -904,6 +972,87 @@ ptranspose(PacketBlock<Packet16b,4>& kernel) {
   kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
 }
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,16>& kernel) {
+  // If we number the elements in the input thus:
+  // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
+  // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
+  // ...
+  // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
+  //
+  // the desired output is:
+  // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
+  // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
+  // ...
+  // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
+  __m128i t0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  __m128i t1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+  __m128i t2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ...                     27 37
+  __m128i t3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ...                     2f 3f
+  __m128i t4 =  _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52                         47 57
+  __m128i t5 =  _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
+  __m128i t6 =  _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t7 =  _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t8 =  _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i t9 =  _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i ta =  _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tb =  _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tc =  _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i td =  _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i te =  _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
+  __m128i tf =  _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
+
+  __m128i s0 =  _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  __m128i s1 =  _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
+  __m128i s2 =  _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
+  __m128i s3 =  _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
+  __m128i s4 =  _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  __m128i s5 =  _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
+  __m128i s6 =  _mm_unpacklo_epi16(t5, t7);
+  __m128i s7 =  _mm_unpackhi_epi16(t5, t7);
+  __m128i s8 =  _mm_unpacklo_epi16(t8, ta);
+  __m128i s9 =  _mm_unpackhi_epi16(t8, ta);
+  __m128i sa =  _mm_unpacklo_epi16(t9, tb);
+  __m128i sb =  _mm_unpackhi_epi16(t9, tb);
+  __m128i sc =  _mm_unpacklo_epi16(tc, te);
+  __m128i sd =  _mm_unpackhi_epi16(tc, te);
+  __m128i se =  _mm_unpacklo_epi16(td, tf);
+  __m128i sf =  _mm_unpackhi_epi16(td, tf);
+
+  __m128i u0 =  _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  __m128i u1 =  _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  __m128i u2 =  _mm_unpacklo_epi32(s1, s5);
+  __m128i u3 =  _mm_unpackhi_epi32(s1, s5);
+  __m128i u4 =  _mm_unpacklo_epi32(s2, s6);
+  __m128i u5 =  _mm_unpackhi_epi32(s2, s6);
+  __m128i u6 =  _mm_unpacklo_epi32(s3, s7);
+  __m128i u7 =  _mm_unpackhi_epi32(s3, s7);
+  __m128i u8 =  _mm_unpacklo_epi32(s8, sc);
+  __m128i u9 =  _mm_unpackhi_epi32(s8, sc);
+  __m128i ua =  _mm_unpacklo_epi32(s9, sd);
+  __m128i ub =  _mm_unpackhi_epi32(s9, sd);
+  __m128i uc =  _mm_unpacklo_epi32(sa, se);
+  __m128i ud =  _mm_unpackhi_epi32(sa, se);
+  __m128i ue =  _mm_unpacklo_epi32(sb, sf);
+  __m128i uf =  _mm_unpackhi_epi32(sb, sf);
+
+  kernel.packet[0]  = _mm_unpacklo_epi64(u0, u8);
+  kernel.packet[1]  = _mm_unpackhi_epi64(u0, u8);
+  kernel.packet[2]  = _mm_unpacklo_epi64(u1, u9);
+  kernel.packet[3]  = _mm_unpackhi_epi64(u1, u9);
+  kernel.packet[4]  = _mm_unpacklo_epi64(u2, ua);
+  kernel.packet[5]  = _mm_unpackhi_epi64(u2, ua);
+  kernel.packet[6]  = _mm_unpacklo_epi64(u3, ub);
+  kernel.packet[7]  = _mm_unpackhi_epi64(u3, ub);
+  kernel.packet[8]  = _mm_unpacklo_epi64(u4, uc);
+  kernel.packet[9]  = _mm_unpackhi_epi64(u4, uc);
+  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
+  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
+  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
+  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
+  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
+  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
+}
 
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
   const __m128i zero = _mm_setzero_si128();
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index c7732e6e6..e59e9df21 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -10,11 +10,25 @@
 
 #include "packetmath_test_shared.h"
 
-#define REF_ADD(a,b) ((a)+(b))
-#define REF_SUB(a,b) ((a)-(b))
-#define REF_MUL(a,b) ((a)*(b))
-#define REF_DIV(a,b) ((a)/(b))
-#define REF_ABS_DIFF(a,b) ((a)>(b)?(a)-(b):(b)-(a))
+template <typename T>
+inline T REF_ADD(const T& a, const T& b) { return a + b;}
+template <typename T>
+inline T REF_SUB(const T& a, const T& b) { return a - b;}
+template <typename T>
+inline T REF_MUL(const T& a, const T& b) { return a * b;}
+template <typename T>
+inline T REF_DIV(const T& a, const T& b) { return a / b;}
+template <typename T>
+inline T REF_ABS_DIFF(const T& a, const T& b) { return a>b ? a - b : b-a;}
+
+// Specializations for bool
+template <>
+inline bool REF_ADD(const bool& a, const bool& b) { return a || b;}
+template <>
+inline bool REF_SUB(const bool& a, const bool& b) { return a ^ b;}
+template <>
+inline bool REF_MUL(const bool& a, const bool& b) { return a && b;}
+
 
 template<typename FromScalar, typename FromPacket, typename ToScalar, typename ToPacket, bool CanCast = false>
 struct test_cast_helper;
@@ -70,7 +84,8 @@ void test_cast() {
   test_cast_helper<FromScalar, FromPacket, ToScalar, ToPacket, CanCast>::run();
 }
 
-template<typename Scalar,typename Packet> void packetmath_boolean()
+template<typename Scalar,typename Packet>
+void packetmath_boolean_mask_ops()
 {
   const int PacketSize = internal::unpacket_traits<Packet>::size;
   const int size = 2*PacketSize;
@@ -82,9 +97,18 @@ template<typename Scalar,typename Packet> void packetmath_boolean()
   {
     data1[i] = internal::random<Scalar>();
   }
-  CHECK_CWISE2_IF(true, internal::por, internal::por);
-  CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
-  CHECK_CWISE2_IF(true, internal::pand, internal::pand);
+  CHECK_CWISE1(internal::ptrue, internal::ptrue);
+  CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
+  for (int i = 0; i < PacketSize; ++i) {
+    data1[i] = Scalar(i);
+    data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
+  }
+  CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
+}
+
+template<>
+void packetmath_boolean_mask_ops<bool, internal::Packet16b>()
+{
 }
 
 template<typename Scalar,typename Packet> void packetmath()
@@ -171,9 +195,6 @@ template<typename Scalar,typename Packet> void packetmath()
   CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL,  internal::pmul);
   CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
 
-  CHECK_CWISE1(internal::pnot, internal::pnot);
-  CHECK_CWISE1(internal::pzero, internal::pzero);
-  CHECK_CWISE1(internal::ptrue, internal::ptrue);
   if (PacketTraits::HasNegate)
     CHECK_CWISE1(internal::negate, internal::pnegate);
   CHECK_CWISE1(numext::conj, internal::pconj);
@@ -252,7 +273,7 @@ template<typename Scalar,typename Packet> void packetmath()
 
   ref[0] = Scalar(1);
   for (int i=0; i<PacketSize; ++i)
-    ref[0] *= data1[i];
+    ref[0] = REF_MUL(ref[0], data1[i]);
   VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload<Packet>(data1))) && "internal::predux_mul");
 
   for (int i=0; i<PacketSize; ++i)
@@ -272,6 +293,7 @@ template<typename Scalar,typename Packet> void packetmath()
     }
   }
 
+
   if (PacketTraits::HasBlend) {
     Packet thenPacket = internal::pload<Packet>(data1);
     Packet elsePacket = internal::pload<Packet>(data2);
@@ -304,26 +326,22 @@ template<typename Scalar,typename Packet> void packetmath()
     CHECK_CWISE3_IF(true, internal::pselect, internal::pselect);
   }
 
-  {
-    for (int i = 0; i < PacketSize; ++i) {
-      data1[i] = Scalar(i);
-      data1[i + PacketSize] = internal::random<bool>() ? data1[i] : Scalar(0);
-    }
-    CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq);
-  }
-
   CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
 
   for (int i=0; i<size; ++i)
   {
     data1[i] = internal::random<Scalar>();
   }
-  CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
+  CHECK_CWISE1(internal::pzero, internal::pzero);
+  CHECK_CWISE2_IF(true, internal::por, internal::por);
+  CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
+  CHECK_CWISE2_IF(true, internal::pand, internal::pand);
 
-  packetmath_boolean<Scalar, Packet>();
+  packetmath_boolean_mask_ops<Scalar, Packet>();
 }
 
 
+
 template<typename Scalar,typename Packet> void packetmath_real()
 {
   typedef internal::packet_traits<Scalar> PacketTraits;
@@ -753,7 +771,7 @@ EIGEN_DECLARE_TEST(packetmath)
     CALL_SUBTEST_12( test::runner<std::complex<double> >::run() );
     CALL_SUBTEST_13(( packetmath<half,internal::packet_traits<half>::type>() ));
 #ifdef EIGEN_PACKET_MATH_SSE_H
-    CALL_SUBTEST_14(( packetmath_boolean<bool,internal::packet_traits<bool>::type>() ));
+    CALL_SUBTEST_14(( packetmath<bool,internal::packet_traits<bool>::type>() ));
 #endif
     g_first_pass = false;
   }
author	Rasmus Munk Larsen <rmlarsen@google.com>	2020-05-11 13:23:31 -0700
committer	Rasmus Munk Larsen <rmlarsen@google.com>	2020-05-14 22:39:13 +0000
commit	9b411757abd8458f9689b1384c6bf75da9b82357 (patch)
tree	eb23c0c65eaa4df204880c94b287fcf1b43e14cd
parent	d640276d31a7dea9207a68a061a6fa7c9fdf50e5 (diff)