diff options
author | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-12-08 18:13:35 -0800 |
---|---|---|
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-12-08 18:13:35 -0800 |
commit | 125cc9a5df6074756b89ea8aaa4e9a4b44b0f7e9 (patch) | |
tree | eaae025ea14e378c922e9fe901b10d111c41c41c /Eigen/src/Core/arch/AVX512 | |
parent | 8cfe0db108f54e4ceae2e94c47c5d2eb5116197b (diff) |
Implement vectorized complex square root.
Closes #1905
Measured speedup for sqrt of `complex<float>` on Skylake:
SSE:
```
name old time/op new time/op delta
BM_eigen_sqrt_ctype/1 49.4ns ± 0% 54.3ns ± 0% +10.01%
BM_eigen_sqrt_ctype/8 332ns ± 0% 50ns ± 1% -84.97%
BM_eigen_sqrt_ctype/64 2.81µs ± 1% 0.38µs ± 0% -86.49%
BM_eigen_sqrt_ctype/512 23.8µs ± 0% 3.0µs ± 0% -87.32%
BM_eigen_sqrt_ctype/4k 202µs ± 0% 24µs ± 2% -88.03%
BM_eigen_sqrt_ctype/32k 1.63ms ± 0% 0.19ms ± 0% -88.18%
BM_eigen_sqrt_ctype/256k 13.0ms ± 0% 1.5ms ± 1% -88.20%
BM_eigen_sqrt_ctype/1M 52.1ms ± 0% 6.2ms ± 0% -88.18%
```
AVX2:
```
name old cpu/op new cpu/op delta
BM_eigen_sqrt_ctype/1 53.6ns ± 0% 55.6ns ± 0% +3.71%
BM_eigen_sqrt_ctype/8 334ns ± 0% 27ns ± 0% -91.86%
BM_eigen_sqrt_ctype/64 2.79µs ± 0% 0.22µs ± 2% -92.28%
BM_eigen_sqrt_ctype/512 23.8µs ± 1% 1.7µs ± 1% -92.81%
BM_eigen_sqrt_ctype/4k 201µs ± 0% 14µs ± 1% -93.24%
BM_eigen_sqrt_ctype/32k 1.62ms ± 0% 0.11ms ± 1% -93.29%
BM_eigen_sqrt_ctype/256k 13.0ms ± 0% 0.9ms ± 1% -93.31%
BM_eigen_sqrt_ctype/1M 52.0ms ± 0% 3.5ms ± 1% -93.31%
```
AVX512:
```
name old cpu/op new cpu/op delta
BM_eigen_sqrt_ctype/1 53.7ns ± 0% 56.2ns ± 1% +4.75%
BM_eigen_sqrt_ctype/8 334ns ± 0% 18ns ± 2% -94.63%
BM_eigen_sqrt_ctype/64 2.79µs ± 0% 0.12µs ± 1% -95.54%
BM_eigen_sqrt_ctype/512 23.9µs ± 1% 1.0µs ± 1% -95.89%
BM_eigen_sqrt_ctype/4k 202µs ± 0% 8µs ± 1% -96.13%
BM_eigen_sqrt_ctype/32k 1.63ms ± 0% 0.06ms ± 1% -96.15%
BM_eigen_sqrt_ctype/256k 13.0ms ± 0% 0.5ms ± 4% -96.11%
BM_eigen_sqrt_ctype/1M 52.1ms ± 0% 2.0ms ± 1% -96.13%
```
Diffstat (limited to 'Eigen/src/Core/arch/AVX512')
-rw-r--r-- | Eigen/src/Core/arch/AVX512/Complex.h | 17 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX512/PacketMath.h | 13 |
2 files changed, 27 insertions, 3 deletions
```diff
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index 53ee53d17..45f22f436 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -37,6 +37,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
     HasMul = 1,
     HasDiv = 1,
     HasNegate = 1,
+    HasSqrt = 1,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
@@ -47,6 +48,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
 
 template<> struct unpacket_traits<Packet8cf> {
   typedef std::complex<float> type;
+  typedef Packet4cf half;
+  typedef Packet16f as_real;
   enum {
     size = 8,
     alignment=unpacket_traits<Packet16f>::alignment,
@@ -54,7 +57,6 @@ template<> struct unpacket_traits<Packet8cf> {
     masked_load_available=false,
     masked_store_available=false
   };
-  typedef Packet4cf half;
 };
 
 template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
@@ -223,6 +225,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
     HasMul = 1,
     HasDiv = 1,
     HasNegate = 1,
+    HasSqrt = 1,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
@@ -233,6 +236,8 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
 
 template<> struct unpacket_traits<Packet4cd> {
   typedef std::complex<double> type;
+  typedef Packet2cd half;
+  typedef Packet8d as_real;
   enum {
     size = 4,
     alignment = unpacket_traits<Packet8d>::alignment,
@@ -240,7 +245,6 @@ template<> struct unpacket_traits<Packet4cd> {
     masked_load_available=false,
     masked_store_available=false
   };
-  typedef Packet2cd half;
 };
 
 template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
@@ -437,8 +441,15 @@ ptranspose(PacketBlock<Packet4cd,4>& kernel) {
   kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
 }
 
-} // end namespace internal
+template<> EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
+  return psqrt_complex<Packet4cd>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
+  return psqrt_complex<Packet8cf>(a);
+}
 
+} // end namespace internal
 } // end namespace Eigen
 
 #endif // EIGEN_COMPLEX_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index a001fb186..6662a5fe7 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -219,6 +219,19 @@ template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return
 template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
 template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); }
 
+template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
+  return Packet16f(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+                                    0, -1, 0, -1, 0, -1, 0, -1));
+}
+template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
+  return Packet16i(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
+                                    0, -1, 0, -1, 0, -1, 0, -1));
+}
+template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
+  return Packet8d(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
+                                   0, 0, -1, -1, 0, 0, -1, -1));
+}
+
 template <>
 EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
   return _mm512_broadcastss_ps(_mm_load_ps1(from));
```