diff options
author | Gael Guennebaud <g.gael@free.fr> | 2018-11-30 15:56:08 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2018-11-30 15:56:08 +0100 |
commit | 69ace742be6f00f4280d312e046b0b1422fd112c (patch) | |
tree | 83d296bd2070b1617791dcf0ee8c3bcd6c5ade2a /Eigen/src | |
parent | fa87f9d876f38e470e5070a451f92a3c19c9d0fe (diff) |
Several improvements regarding packet-bitwise operations:
- add unit tests
- optimize their AVX512f implementation
- add missing implementations (half, Packet4f, ...)
Diffstat (limited to 'Eigen/src')
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 20 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 28 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX512/PacketMath.h | 150 | ||||
-rw-r--r-- | Eigen/src/Core/arch/GPU/PacketMathHalf.h | 26 |
4 files changed, 98 insertions, 126 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index cc044de22..9c2a437bf 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -158,13 +158,11 @@ preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } /** \internal \returns a + b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -padd(const Packet& a, - const Packet& b) { return a+b; } +padd(const Packet& a, const Packet& b) { return a+b; } /** \internal \returns a - b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -psub(const Packet& a, - const Packet& b) { return a-b; } +psub(const Packet& a, const Packet& b) { return a-b; } /** \internal \returns -a (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet @@ -177,23 +175,19 @@ pconj(const Packet& a) { return numext::conj(a); } /** \internal \returns a * b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pmul(const Packet& a, - const Packet& b) { return a*b; } +pmul(const Packet& a, const Packet& b) { return a*b; } /** \internal \returns a / b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pdiv(const Packet& a, - const Packet& b) { return a/b; } +pdiv(const Packet& a, const Packet& b) { return a/b; } /** \internal \returns the min of \a a and \a b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pmin(const Packet& a, - const Packet& b) { return numext::mini(a, b); } +pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); } /** \internal \returns the max of \a a and \a b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pmax(const Packet& a, - const Packet& b) { return numext::maxi(a, b); } +pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); } /** \internal \returns the absolute value of \a a */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet @@ -217,7 +211,7 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } /** \internal \returns the bitwise andnot of \a a and \a b */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (!b); } +pandnot(const Packet& a, const Packet& b) { return a & (~b); } /** \internal \returns \a a shifted by N bits to the right */ template<int N> EIGEN_DEVICE_FUNC inline int diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 284a32676..b49bae0de 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -241,15 +241,43 @@ template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { re template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_and_si256(a,b); +#else + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_or_si256(a,b); +#else + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_xor_si256(a,b); +#else + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); } template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_andnot_si256(b,a); +#else + return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b) { return _mm256_blendv_ps(b,a,mask); } diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 7d90ce4c1..1d38fb758 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -268,30 +268,20 @@ template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packe return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); } + +template <> +EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, + const Packet16i& b) { + return _mm512_and_si512(a,b); +} + template <> EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_and_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> @@ -312,30 +302,18 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a, return res; #endif } + +template <> +EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) { + return _mm512_or_si512(a, b); +} + template <> -EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } @@ -345,106 +323,52 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a, #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) { + return _mm512_xor_si512(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } + template <> -EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) { + return _mm512_andnot_si512(b, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_andnot_ps(b, a); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, pandnot(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, pandnot(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, pandnot(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, pandnot(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_pd(a, b); + return _mm512_andnot_pd(b, a); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 8787adcde..cdd2b001b 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -640,6 +640,19 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = por(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pxor(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pand(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { + Packet16h r; r.x = pandnot(a.x,b.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { // FIXME we could do that with bit manipulation Packet16f af = half2float(a); @@ -1063,6 +1076,19 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = por(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pxor(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pand(a.x,b.x); return r; +} +template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { + Packet8h r; r.x = pandnot(a.x,b.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { |