diff options
author | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-12-08 18:13:35 -0800 |
---|---|---|
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2020-12-08 18:13:35 -0800 |
commit | 125cc9a5df6074756b89ea8aaa4e9a4b44b0f7e9 (patch) | |
tree | eaae025ea14e378c922e9fe901b10d111c41c41c /test | |
parent | 8cfe0db108f54e4ceae2e94c47c5d2eb5116197b (diff) |
Implement vectorized complex square root.
Closes #1905
Measured speedup for sqrt of `complex<float>` on Skylake:
SSE:
```
name old time/op new time/op delta
BM_eigen_sqrt_ctype/1 49.4ns ± 0% 54.3ns ± 0% +10.01%
BM_eigen_sqrt_ctype/8 332ns ± 0% 50ns ± 1% -84.97%
BM_eigen_sqrt_ctype/64 2.81µs ± 1% 0.38µs ± 0% -86.49%
BM_eigen_sqrt_ctype/512 23.8µs ± 0% 3.0µs ± 0% -87.32%
BM_eigen_sqrt_ctype/4k 202µs ± 0% 24µs ± 2% -88.03%
BM_eigen_sqrt_ctype/32k 1.63ms ± 0% 0.19ms ± 0% -88.18%
BM_eigen_sqrt_ctype/256k 13.0ms ± 0% 1.5ms ± 1% -88.20%
BM_eigen_sqrt_ctype/1M 52.1ms ± 0% 6.2ms ± 0% -88.18%
```
AVX2:
```
name old cpu/op new cpu/op delta
BM_eigen_sqrt_ctype/1 53.6ns ± 0% 55.6ns ± 0% +3.71%
BM_eigen_sqrt_ctype/8 334ns ± 0% 27ns ± 0% -91.86%
BM_eigen_sqrt_ctype/64 2.79µs ± 0% 0.22µs ± 2% -92.28%
BM_eigen_sqrt_ctype/512 23.8µs ± 1% 1.7µs ± 1% -92.81%
BM_eigen_sqrt_ctype/4k 201µs ± 0% 14µs ± 1% -93.24%
BM_eigen_sqrt_ctype/32k 1.62ms ± 0% 0.11ms ± 1% -93.29%
BM_eigen_sqrt_ctype/256k 13.0ms ± 0% 0.9ms ± 1% -93.31%
BM_eigen_sqrt_ctype/1M 52.0ms ± 0% 3.5ms ± 1% -93.31%
```
AVX512:
```
name old cpu/op new cpu/op delta
BM_eigen_sqrt_ctype/1 53.7ns ± 0% 56.2ns ± 1% +4.75%
BM_eigen_sqrt_ctype/8 334ns ± 0% 18ns ± 2% -94.63%
BM_eigen_sqrt_ctype/64 2.79µs ± 0% 0.12µs ± 1% -95.54%
BM_eigen_sqrt_ctype/512 23.9µs ± 1% 1.0µs ± 1% -95.89%
BM_eigen_sqrt_ctype/4k 202µs ± 0% 8µs ± 1% -96.13%
BM_eigen_sqrt_ctype/32k 1.63ms ± 0% 0.06ms ± 1% -96.15%
BM_eigen_sqrt_ctype/256k 13.0ms ± 0% 0.5ms ± 4% -96.11%
BM_eigen_sqrt_ctype/1M 52.1ms ± 0% 2.0ms ± 1% -96.13%
```
Diffstat (limited to 'test')
-rw-r--r-- | test/packetmath.cpp | 53 |
1 file changed, 51 insertions, 2 deletions
diff --git a/test/packetmath.cpp b/test/packetmath.cpp index d995e8b71..0e49d93a9 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -473,8 +473,6 @@ void packetmath() { CHECK_CWISE3_IF(true, internal::pselect, internal::pselect); } - CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); - for (int i = 0; i < size; ++i) { data1[i] = internal::random<Scalar>(); } @@ -486,6 +484,11 @@ void packetmath() { packetmath_boolean_mask_ops<Scalar, Packet>(); packetmath_pcast_ops_runner<Scalar, Packet>::run(); packetmath_minus_zero_add<Scalar, Packet>(); + + for (int i = 0; i < size; ++i) { + data1[i] = numext::abs(internal::random<Scalar>()); + } + CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); } // Notice that this definition works for complex types as well. @@ -899,6 +902,8 @@ void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) { template <typename Scalar, typename Packet> void packetmath_complex() { + typedef internal::packet_traits<Scalar> PacketTraits; + typedef typename Scalar::value_type RealScalar; const int PacketSize = internal::unpacket_traits<Packet>::size; const int size = PacketSize * 4; @@ -917,11 +922,55 @@ void packetmath_complex() { test_conj_helper<Scalar, Packet, true, false>(data1, data2, ref, pval); test_conj_helper<Scalar, Packet, true, true>(data1, data2, ref, pval); + // Test pcplxflip. { for (int i = 0; i < PacketSize; ++i) ref[i] = Scalar(std::imag(data1[i]), std::real(data1[i])); internal::pstore(pval, internal::pcplxflip(internal::pload<Packet>(data1))); VERIFY(test::areApprox(ref, pval, PacketSize) && "pcplxflip"); } + + if (PacketTraits::HasSqrt) { + for (int i = 0; i < size; ++i) { + data1[i] = Scalar(internal::random<RealScalar>(), internal::random<RealScalar>()); + } + CHECK_CWISE1(numext::sqrt, internal::psqrt); + + // Test misc. corner cases. 
+ const RealScalar zero = RealScalar(0); + const RealScalar one = RealScalar(1); + const RealScalar inf = std::numeric_limits<RealScalar>::infinity(); + const RealScalar nan = std::numeric_limits<RealScalar>::quiet_NaN(); + data1[0] = Scalar(zero, zero); + data1[1] = Scalar(-zero, zero); + data1[2] = Scalar(one, zero); + data1[3] = Scalar(zero, one); + CHECK_CWISE1(numext::sqrt, internal::psqrt); + data1[0] = Scalar(-one, zero); + data1[1] = Scalar(zero, -one); + data1[2] = Scalar(one, one); + data1[3] = Scalar(-one, -one); + CHECK_CWISE1(numext::sqrt, internal::psqrt); + data1[0] = Scalar(inf, zero); + data1[1] = Scalar(zero, inf); + data1[2] = Scalar(-inf, zero); + data1[3] = Scalar(zero, -inf); + CHECK_CWISE1(numext::sqrt, internal::psqrt); + data1[0] = Scalar(inf, inf); + data1[1] = Scalar(-inf, inf); + data1[2] = Scalar(inf, -inf); + data1[3] = Scalar(-inf, -inf); + CHECK_CWISE1(numext::sqrt, internal::psqrt); + data1[0] = Scalar(nan, zero); + data1[1] = Scalar(zero, nan); + data1[2] = Scalar(nan, one); + data1[3] = Scalar(one, nan); + CHECK_CWISE1(numext::sqrt, internal::psqrt); + data1[0] = Scalar(nan, nan); + data1[1] = Scalar(inf, nan); + data1[2] = Scalar(nan, inf); + data1[3] = Scalar(-inf, nan); + CHECK_CWISE1(numext::sqrt, internal::psqrt); + } } template <typename Scalar, typename Packet> |