From ab773c7e914633ec4a3ee1f7cdea8b168d3bce1a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 24 Apr 2020 17:29:25 -0700 Subject: Extend support for Packet16b: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add ptranspose<*,4> to support matmul and add unit test for Matrix * Matrix * work around a bug in slicing of Tensor. * Add tensor tests This speeds up matmul for boolean matrices by about 10x name old time/op new time/op delta BM_MatMul/8 267ns ± 0% 479ns ± 0% +79.25% (p=0.008 n=5+5) BM_MatMul/32 6.42µs ± 0% 0.87µs ± 0% -86.50% (p=0.008 n=5+5) BM_MatMul/64 43.3µs ± 0% 5.9µs ± 0% -86.42% (p=0.008 n=5+5) BM_MatMul/128 315µs ± 0% 44µs ± 0% -85.98% (p=0.008 n=5+5) BM_MatMul/256 2.41ms ± 0% 0.34ms ± 0% -85.68% (p=0.008 n=5+5) BM_MatMul/512 18.8ms ± 0% 2.7ms ± 0% -85.53% (p=0.008 n=5+5) BM_MatMul/1k 149ms ± 0% 22ms ± 0% -85.40% (p=0.008 n=5+5) --- Eigen/src/Core/GenericPacketMath.h | 6 +++++ Eigen/src/Core/arch/SSE/PacketMath.h | 37 +++++++++++++++++++++++++-- Eigen/src/Core/functors/BinaryFunctors.h | 44 +++++++++++++++----------------- 3 files changed, 62 insertions(+), 25 deletions(-) (limited to 'Eigen/src') diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 5612ef449..3f2489b46 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -179,6 +179,9 @@ preinterpret(const Packet& a); /* { return reinterpret_cast(a); } /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, const Packet& b) { return a+b; } +// Avoid compiler warning for boolean algebra. +template<> EIGEN_DEVICE_FUNC inline bool +padd(const bool& a, const bool& b) { return a || b; } /** \internal \returns a - b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet @@ -196,6 +199,9 @@ pconj(const Packet& a) { return numext::conj(a); } /** \internal \returns a * b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pmul(const Packet& a, const Packet& b) { return a*b; } +// Avoid compiler warning for boolean algebra. +template<> EIGEN_DEVICE_FUNC inline bool +pmul(const bool& a, const bool& b) { return a && b; } /** \internal \returns a / b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index ac0799467..16215ec72 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -170,10 +170,10 @@ template<> struct packet_traits : default_packet_traits HasHalfPacket = 0, size=16, - HasAdd = 0, + HasAdd = 1, HasSub = 0, HasShift = 0, - HasMul = 0, + HasMul = 1, HasNegate = 0, HasAbs = 0, HasAbs2 = 0, @@ -249,6 +249,8 @@ template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b padd(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } @@ -290,6 +292,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet16b pmul(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } @@ -646,6 +650,7 @@ template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return _mm_cvtss_f32(a); } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return _mm_cvtsd_f64(a); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { return _mm_cvtsi128_si32(a); } +template<> EIGEN_STRONG_INLINE bool pfirst(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast(x & 1); } #endif template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) @@ -762,6 +767,7 @@ template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) Packet4i tmp0 = _mm_hadd_epi32(a,a); return pfirst(_mm_hadd_epi32(tmp0,tmp0)); } + #else template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -769,8 +775,22 @@ template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); } #endif + +#ifdef EIGEN_VECTORIZE_SSE4_1 +template<> EIGEN_STRONG_INLINE bool predux(const Packet16b& a) { + Packet16b tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a)); + return _mm_extract_epi64(tmp, 0) != 0; +} +#else +template<> EIGEN_STRONG_INLINE bool predux(const Packet16b& a) { +Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a)); + return (pfirst(tmp) != 0) || (pfirst(_mm_shuffle_epi32(tmp, 1)) != 0); +} +#endif + // Other reduction functions: + // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { @@ -987,6 +1007,19 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); + __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); + __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = _mm_unpacklo_epi16(T0, T2); + kernel.packet[1] = _mm_unpackhi_epi16(T0, T2); + kernel.packet[2] = _mm_unpacklo_epi16(T1, T3); + kernel.packet[3] = _mm_unpackhi_epi16(T1, T3); +} + + template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { const __m128i zero = _mm_setzero_si128(); const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index a2bc58c76..697816663 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -39,12 +39,12 @@ struct scalar_sum_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::padd(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux(a); } }; template @@ -56,15 +56,9 @@ struct functor_traits > { }; }; -/** \internal - * \brief Template specialization to deprecate the summation of boolean expressions. - * This is required to solve Bug 426. - * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast() - */ -template<> struct scalar_sum_op : scalar_sum_op { - EIGEN_DEPRECATED - scalar_sum_op() {} -}; + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op::operator() (const bool& a, const bool& b) const { return a || b; } /** \internal @@ -83,12 +77,12 @@ struct scalar_product_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux_mul(a); } }; template @@ -100,6 +94,10 @@ struct functor_traits > { }; }; +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op::operator() (const bool& a, const bool& b) const { return a && b; } + + /** \internal * \brief Template functor to compute the conjugate product of two scalars * @@ -116,11 +114,11 @@ struct scalar_conj_product_op : binary_op_base typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return conj_helper().pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return conj_helper().pmul(a,b); } }; template @@ -141,12 +139,12 @@ struct scalar_min_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmin(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux_min(a); } }; template @@ -167,12 +165,12 @@ struct scalar_max_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmax(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux_max(a); } }; template -- cgit v1.2.3