From e15bb785adf756f3e48410ee681ca97ad5bb3e76 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Wed, 9 Jan 2019 16:34:23 -0800
Subject: Collapsed revision * Add packet up "pones". Write pnot(a) as
 pxor(pones(a), a). * Collapsed revision * Simplify a bit. * Undo useless
 diffs. * Fix typo.

---
 Eigen/src/Core/GenericPacketMath.h       | 29 ++++++++-----------
 Eigen/src/Core/arch/AVX/Complex.h        | 10 ++++---
 Eigen/src/Core/arch/AVX/PacketMath.h     | 19 +++++++++++++
 Eigen/src/Core/arch/AVX512/Complex.h     | 10 ++-----
 Eigen/src/Core/arch/AVX512/PacketMath.h  | 49 +++++++++++++++++++-------------
 Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 +++++++++++
 Eigen/src/Core/arch/SSE/Complex.h        | 11 ++++---
 Eigen/src/Core/arch/SSE/PacketMath.h     | 11 +++++++
 test/packetmath.cpp                      |  2 ++
 9 files changed, 104 insertions(+), 53 deletions(-)
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 8bdf16e16..8bcceaa7b 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -214,17 +214,13 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pandnot(const Packet& a, const Packet& b) { return a & (~b); }
 
-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+/** \internal \returns ones */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;}
 
 /** \internal \returns the bitwise not of \a a */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pnot(const Packet& a) {
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  Packet ones  = pset1<Packet>(Scalar(1));
-  return pandnot(ones, a);
-}
+template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) { return pxor(pones(a), a);}
 
 /** \internal \returns \a a shifted by N bits to the right */
 template<int N> EIGEN_DEVICE_FUNC inline int
@@ -262,24 +258,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) {
 
 /** \internal \returns a <= b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? pones(a) : pzero(a); }
 
 /** \internal \returns a < b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt(const Packet& a, const Packet& b); /* { return a<b  ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_lt(const Packet& a, const Packet& b)  { return a<b ? pones(a) : pzero(a); }
 
 /** \internal \returns a == b as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_eq(const Packet& a, const Packet& b)
-{
-  typedef typename unpacket_traits<Packet>::type Scalar;
-  Packet zeros  = pset1<Packet>(Scalar(0));
-  return a==b ? pnot(zeros) : zeros;
-}
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); }
 
 /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } 
 
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -289,6 +280,10 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+
 /** \internal \returns a packet with constant coefficients set from bits */
 template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
 pset1frombits(BitsType a);
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 23687c624..9f1bb969e 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -72,10 +72,11 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con
 template <>
 EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
   __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
-  __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1));
-  return Packet4cf(real_and_imag_equal);
+  return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
 }
 
+template<> EIGEN_STRONG_INLINE Packet4cf pones<Packet4cf>(const Packet4cf& a) { return Packet4cf(pones(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
@@ -286,10 +287,11 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con
 template <>
 EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
   __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
-  __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5));
-  return Packet2cd(real_and_imag_equal);
+  return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cd pones<Packet2cd>(const Packet2cd& a) { return Packet2cd(pones(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index a6af48f21..f6a514fbf 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
 template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
 
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template<> EIGEN_STRONG_INLINE Packet8i pones<Packet8i>(const Packet8i& a) {
+  return _mm256_cmpeq_epi64(a,a);
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet8i pones<Packet8i>(const Packet8i& /*a*/) {
+  const unsigned int o = 0xffffffffu;
+  return _mm256_set_epi32(o, o, o, o, o, o, o, o);
+}
+#endif
+template<> EIGEN_STRONG_INLINE Packet8f pones<Packet8f>(const Packet8f& a) {
+  return _mm256_castsi256_ps(pones<Packet8i>(_mm256_castps_si256(a)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d pones<Packet4d>(const Packet4d& a) {
+  return _mm256_castsi256_pd(pones<Packet8i>(_mm256_castpd_si256(a)));
+}
+
 template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index 2c613f870..154fedc25 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -83,10 +83,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a,
 template <>
 EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
   __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
-  __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1);
-  __m512i real_and_imag_equal = _mm512_and_si512(
-      _mm512_castps_si512(eq), _mm512_castps_si512(eq_swap_real_imag));
-  return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal));
+  return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
@@ -279,10 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a,
 template <>
 EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
   __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
-  __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55);
-  __m512i real_and_imag_equal = _mm512_and_si512(
-      _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag));
-  return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal));
+  return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 68adf5e57..9666c4e22 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -284,47 +284,56 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
 #endif
 
 template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
-  __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b));
-  __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b));
-  return cat256(lo, hi);
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
 }
 
 template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
-  __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b));
-  __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b));
-  return cat256(lo, hi);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
-  __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b));
-  __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b));
-  return cat256(lo, hi);
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
 }
 
 template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
-  __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b));
-  __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b));
-  return cat256(lo, hi);
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+  return _mm512_castsi512_ps(
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
 }
 
 template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
-  __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
-  __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
-  return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
   return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff));
+      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
   return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff));
+      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pones<Packet16i>(const Packet16i& /*a*/) {
+  const unsigned int o = 0xffffffffu;
+  return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pones<Packet16f>(const Packet16f& a) {
+  return _mm512_castsi512_ps(pones<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pones<Packet8d>(const Packet8d& a) {
+  return _mm512_castsi512_pd(pones<Packet16i>(_mm512_castpd_si512(a)));
 }
 
 template <>
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 3e35f96cc..c4dfedcf8 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2&
   return result;
 }
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones<half2>(const half2& a) {
+  half2 result;
+  *(reinterpret_cast<unsigned*>(&(result))) = 0xffffffffu;
+}  
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
@@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
+  Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) {
+  Packet16h r; r.x = Packet8i(pones(a.x)); return r;
+}
+
 template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
   // in some cases Packet8i is a wrapper around __m256i, so we need to 
   // cast to Packet8i to call the correct overload.
@@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) {
+  Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r;
+}
+
 template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
   // in some cases Packet4i is a wrapper around __m128i, so we either need to 
   // cast to Packet4i to directly call the intrinsics as below:
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index a7304193b..fa84097ac 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pones  <Packet2cf>(const Packet2cf& a) { return Packet2cf(pones(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnot   <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(a.v)); }
+
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
@@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cd pones  <Packet1cd>(const Packet1cd& a) { return Packet1cd(pones(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnot   <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
@@ -442,15 +447,13 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
 {
   __m128 eq = _mm_cmpeq_ps(a.v, b.v);
-  __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2));
-  return Packet2cf(real_and_imag_equal);
+  return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
 {
   __m128d eq = _mm_cmpeq_pd(a.v, b.v);
-  __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0));
-  return Packet1cd(real_and_imag_equal);
+  return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0)));
 }
 
 template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b8a5497a9..6dd2f8a46 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4
 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet4i pones<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+pones<Packet4f>(const Packet4f& a) {
+  Packet4i b = _mm_castps_si128(a);
+  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+pones<Packet2d>(const Packet2d& a) {
+  Packet4i b = _mm_castpd_si128(a);
+  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index a88b7bba9..460cfbdbe 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -239,6 +239,8 @@ template<typename Scalar,typename Packet> void packetmath()
   CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
 
   CHECK_CWISE1(internal::pnot, internal::pnot);
+  CHECK_CWISE1(internal::pzero, internal::pzero);
+  CHECK_CWISE1(internal::pones, internal::pones);
   CHECK_CWISE1(internal::negate, internal::pnegate);
   CHECK_CWISE1(numext::conj, internal::pconj);
 
-- 
cgit v1.2.3


From 0abe03764c697ed8da37ce4421dd1918aa7a9b5f Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 10 Jan 2019 10:27:55 -0800
Subject: Fix shorten-64-to-32 warning in TensorContractionThreadPool

---
 unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 9666bf167..d68409e26 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -890,7 +890,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 
       TENSOR_CONTRACTION_DISPATCH(
           this->template evalGemmPartialWithoutOutputKernel, Alignment,
-          (buf, begin, end, /*num_threads=*/num_blocks));
+          (buf, begin, end,
+           /*num_threads=*/internal::convert_index<int>(num_blocks)));
 
       // Check if it was the last task in l0 range.
       const Index l0_index = block_idx / l0_size;
-- 
cgit v1.2.3


From 0522460a0d01d4253183349a49144b5ad8ba2f9f Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg <chtz@informatik.uni-bremen.de>
Date: Fri, 11 Jan 2019 11:07:56 +0100
Subject: bug #1656: Enable failtests only if BUILD_TESTING is enabled

---
 CMakeLists.txt | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5255e9600..48c0a6367 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -469,6 +469,8 @@ if(BUILD_TESTING)
   else()
     add_subdirectory(test EXCLUDE_FROM_ALL)
   endif()
+
+  add_subdirectory(failtest)
 endif()
 
 if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
@@ -519,8 +521,6 @@ message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}")
 message(STATUS "")
 
-add_subdirectory(failtest)
-
 string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower)
 if(cmake_generator_tolower MATCHES "makefile")
   message(STATUS "Some things you can do now:")
@@ -537,8 +537,10 @@ if(cmake_generator_tolower MATCHES "makefile")
   message(STATUS "              |   Or:")
   message(STATUS "              |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
   message(STATUS "make doc      | Generate the API documentation, requires Doxygen & LaTeX")
-  message(STATUS "make check    | Build and run the unit-tests. Read this page:")
-  message(STATUS "              |   http://eigen.tuxfamily.org/index.php?title=Tests")
+  if(BUILD_TESTING)
+    message(STATUS "make check    | Build and run the unit-tests. Read this page:")
+    message(STATUS "              |   http://eigen.tuxfamily.org/index.php?title=Tests")
+  endif()
   message(STATUS "make blas     | Build BLAS library (not the same thing as Eigen)")
   message(STATUS "make uninstall| Removes files installed by make install")
   message(STATUS "--------------+--------------------------------------------------------------")
-- 
cgit v1.2.3


From 3c9add6598cc35e5317788627dfa81f517e89e07 Mon Sep 17 00:00:00 2001
From: Mark D Ryan <mark.d.ryan@intel.com>
Date: Fri, 11 Jan 2019 14:02:09 +0100
Subject: Remove reinterpret_cast from AVX512 complex implementation

The reinterpret_casts used in ptranspose(PacketBlock<Packet8cf,4>&)
ptranspose(PacketBlock<Packet8cf,8>&) don't appear to be working
correctly.  They're used to convert the kernel parameters to
PacketBlock<Packet8d,T>& so that the complex number versions of
ptranspose can be written using the existing double implementations.
Unfortunately, they don't seem to work and are responsible for 9 unit
test failures in the AVX512 build of tensorflow master.  This commit
fixes the issue by manually initialising PacketBlock<Packet8d,T>
variables with the contents of the kernel parameter before calling
the double version of ptranspose, and then copying the resulting
values back into the kernel parameter before returning.
---
 Eigen/src/Core/arch/AVX512/Complex.h | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index 42cdfcd25..9c4ee1235 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -390,12 +390,40 @@ template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet8cf,4>& kernel) {
-  ptranspose(reinterpret_cast<PacketBlock<Packet8d,4>&>(kernel));
+  PacketBlock<Packet8d,4> pb;
+  
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
 }
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet8cf,8>& kernel) {
-  ptranspose(reinterpret_cast<PacketBlock<Packet8d,8>&>(kernel));
+  PacketBlock<Packet8d,8> pb;
+  
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
+  pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
+  pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
+  pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+  kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
+  kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
+  kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
+  kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
 }
 
 EIGEN_DEVICE_FUNC inline void
-- 
cgit v1.2.3