From e15bb785adf756f3e48410ee681ca97ad5bb3e76 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 9 Jan 2019 16:34:23 -0800 Subject: Collapsed revision * Add packet up "pones". Write pnot(a) as pxor(pones(a), a). * Collapsed revision * Simplify a bit. * Undo useless diffs. * Fix typo. --- Eigen/src/Core/GenericPacketMath.h | 29 ++++++++----------- Eigen/src/Core/arch/AVX/Complex.h | 10 ++++--- Eigen/src/Core/arch/AVX/PacketMath.h | 19 +++++++++++++ Eigen/src/Core/arch/AVX512/Complex.h | 10 ++----- Eigen/src/Core/arch/AVX512/PacketMath.h | 49 +++++++++++++++++++------------- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 16 +++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 11 ++++--- Eigen/src/Core/arch/SSE/PacketMath.h | 11 +++++++ test/packetmath.cpp | 2 ++ 9 files changed, 104 insertions(+), 53 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bdf16e16..8bcceaa7b 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,17 +214,13 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +/** \internal \returns ones */ template EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits::type& a) { return a; } +pones(const Packet& /*a*/) { Packet b; memset(&b, 0xff, sizeof(b)); return b;} /** \internal \returns the bitwise not of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - Packet ones = pset1(Scalar(1)); - return pandnot(ones, a); -} +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(pones(a), a);} /** \internal \returns \a a shifted by N bits to the right */ template EIGEN_DEVICE_FUNC inline int @@ -262,24 +258,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) { /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? pones(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) -{ - typedef typename unpacket_traits::type Scalar; - Packet zeros = pset1(Scalar(0)); - return a==b ? pnot(zeros) : zeros; -} +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? pones(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -289,6 +280,10 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits::type& a) { return a; } + /** \internal \returns a packet with constant coefficients set from bits */ template EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23687c624..9f1bb969e 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -72,10 +72,11 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con template <> EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); - __m256 real_and_imag_equal = _mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)); - return Packet4cf(real_and_imag_equal); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); } +template<> EIGEN_STRONG_INLINE Packet4cf pones(const Packet4cf& a) { return Packet4cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -286,10 +287,11 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con template <> EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); - __m256d real_and_imag_equal = _mm256_and_pd(eq, _mm256_permute_pd(eq, 0x5)); - return Packet2cd(real_and_imag_equal); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); } +template<> EIGEN_STRONG_INLINE Packet2cd pones(const Packet2cd& a) { return Packet2cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index a6af48f21..f6a514fbf 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -250,6 +250,25 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +#ifdef EIGEN_VECTORIZE_AVX2 +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& a) { + return _mm256_cmpeq_epi64(a,a); +} +#else +template<> EIGEN_STRONG_INLINE Packet8i pones(const Packet8i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm256_set_epi32(o, o, o, o, o, o, o, o); +} +#endif +template<> EIGEN_STRONG_INLINE Packet8f pones(const Packet8f& a) { + return _mm256_castsi256_ps(pones(_mm256_castps_si256(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4d pones(const Packet4d& a) { + return _mm256_castsi256_pd(pones(_mm256_castpd_si256(a))); +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 2c613f870..154fedc25 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -83,10 +83,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, template <> EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { __m512 eq = pcmp_eq(a.v, b.v); - __m512 eq_swap_real_imag = _mm512_permute_ps(eq, 0xB1); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castps_si512(eq), _mm512_castps_si512(eq_swap_real_imag)); - return Packet8cf(_mm512_castsi512_ps(real_and_imag_equal)); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); } template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } @@ -279,10 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, template <> EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { __m512d eq = pcmp_eq(a.v, b.v); - __m512d eq_swap_real_imag = _mm512_permute_pd(eq, 0x55); - __m512i real_and_imag_equal = _mm512_and_si512( - _mm512_castpd_si512(eq), _mm512_castpd_si512(eq_swap_real_imag)); - return Packet4cd(_mm512_castsi512_pd(real_and_imag_equal)); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); } template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 68adf5e57..9666c4e22 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -284,47 +284,56 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { #endif template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_le(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_le(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); -} - -template<> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_eq(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_eq(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __m256 lo = pcmp_lt_or_nan(extract256<0>(a), extract256<0>(b)); - __m256 hi = pcmp_lt_or_nan(extract256<1>(a), extract256<1>(b)); - return cat256(lo, hi); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); - __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); - return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); } template <> EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffff)); + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } template <> EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffff)); + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pones(const Packet16i& /*a*/) { + const unsigned int o = 0xffffffffu; + return _mm512_set_epi32(o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pones(const Packet16f& a) { + return _mm512_castsi512_ps(pones(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pones(const Packet8d& a) { + return _mm512_castsi512_pd(pones(_mm512_castpd_si512(a))); } template <> diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 3e35f96cc..c4dfedcf8 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pones(const half2& a) { + half2 result; + *(reinterpret_cast(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h pones(const Packet16h& a) { + Packet16h r; r.x = Packet8i(pones(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. @@ -1085,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h pones(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index a7304193b..fa84097ac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf pones (const Packet2cf& a) { return Packet2cf(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd pones (const Packet1cd& a) { return Packet1cd(pones(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -442,15 +447,13 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { __m128 eq = _mm_cmpeq_ps(a.v, b.v); - __m128 real_and_imag_equal = _mm_and_ps(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)); - return Packet2cf(real_and_imag_equal); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { __m128d eq = _mm_cmpeq_pd(a.v, b.v); - __m128d real_and_imag_equal = _mm_and_pd(eq, vec2d_swizzle1(eq, 1, 0)); - return Packet1cd(real_and_imag_equal); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b8a5497a9..6dd2f8a46 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -378,6 +378,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pones(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +pones(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +pones(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index a88b7bba9..460cfbdbe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -239,6 +239,8 @@ template void packetmath() CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); CHECK_CWISE1(internal::pnot, internal::pnot); + CHECK_CWISE1(internal::pzero, internal::pzero); + CHECK_CWISE1(internal::pones, internal::pones); CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); -- cgit v1.2.3 From 0abe03764c697ed8da37ce4421dd1918aa7a9b5f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 10 Jan 2019 10:27:55 -0800 Subject: Fix shorten-64-to-32 warning in TensorContractionThreadPool --- unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 9666bf167..d68409e26 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -890,7 +890,8 @@ struct TensorEvaluatortemplate evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, /*num_threads=*/num_blocks)); + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); // Check if it was the last task in l0 range. const Index l0_index = block_idx / l0_size; -- cgit v1.2.3 From 0522460a0d01d4253183349a49144b5ad8ba2f9f Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 11 Jan 2019 11:07:56 +0100 Subject: bug #1656: Enable failtests only if BUILD_TESTING is enabled --- CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5255e9600..48c0a6367 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -469,6 +469,8 @@ if(BUILD_TESTING) else() add_subdirectory(test EXCLUDE_FROM_ALL) endif() + + add_subdirectory(failtest) endif() if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) @@ -519,8 +521,6 @@ message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") message(STATUS "") -add_subdirectory(failtest) - string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) if(cmake_generator_tolower MATCHES "makefile") message(STATUS "Some things you can do now:") @@ -537,8 +537,10 @@ if(cmake_generator_tolower MATCHES "makefile") message(STATUS " | Or:") message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") message(STATUS "make doc | Generate the API documentation, requires Doxygen & LaTeX") - message(STATUS "make check | Build and run the unit-tests. Read this page:") - message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") + if(BUILD_TESTING) + message(STATUS "make check | Build and run the unit-tests. Read this page:") + message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") + endif() message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)") message(STATUS "make uninstall| Removes files installed by make install") message(STATUS "--------------+--------------------------------------------------------------") -- cgit v1.2.3 From 3c9add6598cc35e5317788627dfa81f517e89e07 Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Fri, 11 Jan 2019 14:02:09 +0100 Subject: Remove reinterpret_cast from AVX512 complex implementation The reinterpret_casts used in ptranspose(PacketBlock&) ptranspose(PacketBlock&) don't appear to be working correctly. They're used to convert the kernel parameters to PacketBlock& so that the complex number versions of ptranspose can be written using the existing double implementations. Unfortunately, they don't seem to work and are responsible for 9 unit test failures in the AVX512 build of tensorflow master. This commit fixes the issue by manually initialising PacketBlock variables with the contents of the kernel parameter before calling the double version of ptranspose, and then copying the resulting values back into the kernel parameter before returning. --- Eigen/src/Core/arch/AVX512/Complex.h | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 42cdfcd25..9c4ee1235 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -390,12 +390,40 @@ template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - ptranspose(reinterpret_cast&>(kernel)); + PacketBlock pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); } EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - ptranspose(reinterpret_cast&>(kernel)); + PacketBlock pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v); + pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v); + pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v); + pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); + kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]); + kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]); + kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]); + kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]); } EIGEN_DEVICE_FUNC inline void -- cgit v1.2.3