diff options
author | Srinivas Vasudevan <srvasude@google.com> | 2019-09-11 18:34:02 -0700 |
---|---|---|
committer | Srinivas Vasudevan <srvasude@google.com> | 2019-09-11 18:34:02 -0700 |
commit | facdec5aa7d947d5462c9dbaefa7a50c4cabff3b (patch) | |
tree | 3a64e2ead30c9dd3027dc9d4f9d9168f2f92b30a /Eigen/src/Core | |
parent | b052ec699249f87d428b38c51ebd5f59d45f7f91 (diff) |
Add packetized versions of i0e and i1e special functions.
- In particular, refactor the i0e and i1e code so that the scalar and vectorized paths share code.
- Move chebevl to GenericPacketMathFunctions.
A brief benchmark, with Eigen built using the FMA, AVX and AVX2 flags:
Before:
CPU: Intel Haswell with HyperThreading (6 cores)
Benchmark Time(ns) CPU(ns) Iterations
-----------------------------------------------------------------
BM_eigen_i0e_double/1 57.3 57.3 10000000
BM_eigen_i0e_double/8 398 398 1748554
BM_eigen_i0e_double/64 3184 3184 218961
BM_eigen_i0e_double/512 25579 25579 27330
BM_eigen_i0e_double/4k 205043 205042 3418
BM_eigen_i0e_double/32k 1646038 1646176 422
BM_eigen_i0e_double/256k 13180959 13182613 53
BM_eigen_i0e_double/1M 52684617 52706132 10
BM_eigen_i0e_float/1 28.4 28.4 24636711
BM_eigen_i0e_float/8 75.7 75.7 9207634
BM_eigen_i0e_float/64 512 512 1000000
BM_eigen_i0e_float/512 4194 4194 166359
BM_eigen_i0e_float/4k 32756 32761 21373
BM_eigen_i0e_float/32k 261133 261153 2678
BM_eigen_i0e_float/256k 2087938 2088231 333
BM_eigen_i0e_float/1M 8380409 8381234 84
BM_eigen_i1e_double/1 56.3 56.3 10000000
BM_eigen_i1e_double/8 397 397 1772376
BM_eigen_i1e_double/64 3114 3115 223881
BM_eigen_i1e_double/512 25358 25361 27761
BM_eigen_i1e_double/4k 203543 203593 3462
BM_eigen_i1e_double/32k 1613649 1613803 428
BM_eigen_i1e_double/256k 12910625 12910374 54
BM_eigen_i1e_double/1M 51723824 51723991 10
BM_eigen_i1e_float/1 28.3 28.3 24683049
BM_eigen_i1e_float/8 74.8 74.9 9366216
BM_eigen_i1e_float/64 505 505 1000000
BM_eigen_i1e_float/512 4068 4068 171690
BM_eigen_i1e_float/4k 31803 31806 21948
BM_eigen_i1e_float/32k 253637 253692 2763
BM_eigen_i1e_float/256k 2019711 2019918 346
BM_eigen_i1e_float/1M 8238681 8238713 86
After:
CPU: Intel Haswell with HyperThreading (6 cores)
Benchmark Time(ns) CPU(ns) Iterations
-----------------------------------------------------------------
BM_eigen_i0e_double/1 15.8 15.8 44097476
BM_eigen_i0e_double/8 99.3 99.3 7014884
BM_eigen_i0e_double/64 777 777 886612
BM_eigen_i0e_double/512 6180 6181 100000
BM_eigen_i0e_double/4k 48136 48140 14678
BM_eigen_i0e_double/32k 385936 385943 1801
BM_eigen_i0e_double/256k 3293324 3293551 228
BM_eigen_i0e_double/1M 12423600 12424458 57
BM_eigen_i0e_float/1 16.3 16.3 43038042
BM_eigen_i0e_float/8 30.1 30.1 23456931
BM_eigen_i0e_float/64 169 169 4132875
BM_eigen_i0e_float/512 1338 1339 516860
BM_eigen_i0e_float/4k 10191 10191 68513
BM_eigen_i0e_float/32k 81338 81337 8531
BM_eigen_i0e_float/256k 651807 651984 1000
BM_eigen_i0e_float/1M 2633821 2634187 268
BM_eigen_i1e_double/1 16.2 16.2 42352499
BM_eigen_i1e_double/8 110 110 6316524
BM_eigen_i1e_double/64 822 822 851065
BM_eigen_i1e_double/512 6480 6481 100000
BM_eigen_i1e_double/4k 51843 51843 10000
BM_eigen_i1e_double/32k 414854 414852 1680
BM_eigen_i1e_double/256k 3320001 3320568 212
BM_eigen_i1e_double/1M 13442795 13442391 53
BM_eigen_i1e_float/1 17.6 17.6 41025735
BM_eigen_i1e_float/8 35.5 35.5 19597891
BM_eigen_i1e_float/64 240 240 2924237
BM_eigen_i1e_float/512 1424 1424 485953
BM_eigen_i1e_float/4k 10722 10723 65162
BM_eigen_i1e_float/32k 86286 86297 8048
BM_eigen_i1e_float/256k 691821 691868 1000
BM_eigen_i1e_float/1M 2777336 2777747 256
This shows roughly a 50% to 75% reduction in run time on these operations (about 40% for the single-element float cases).
I've also benchmarked without any of these flags turned on, and got similar
performance to before (if not better).
Also tested packetmath.cpp + special_functions to ensure no regressions.
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX512/PacketMath.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 91 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 3 | ||||
-rw-r--r-- | Eigen/src/Core/util/ForwardDeclarations.h | 2 |
5 files changed, 99 insertions, 1 deletions
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e3363d006..2e5f5e5bc 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -73,6 +73,8 @@ template<> struct packet_traits<float> : default_packet_traits HasExpm1 = 1, HasExp = 1, HasNdtri = 1, + HasI0e = 1, + HasI1e = 1, HasSqrt = 1, HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 11c8dae02..67e667640 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -99,6 +99,8 @@ template<> struct packet_traits<float> : default_packet_traits HasExpm1 = 1, HasNdtri = 1, #endif + HasI0e = 1, + HasI1e = 1, HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 518db2207..0a4b66089 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -570,6 +570,97 @@ struct ppolevl<Packet, 0> { } }; +/* chbevl (modified for Eigen) + * + * Evaluate Chebyshev series + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N], chebevl(); + * + * y = chbevl( x, coef, N ); + * + * + * + * DESCRIPTION: + * + * Evaluates the series + * + * N-1 + * - ' + * y = > coef[i] T (x/2) + * - i + * i=0 + * + * of Chebyshev polynomials Ti at argument x/2. + * + * Coefficients are stored in reverse order, i.e. the zero + * order term is last in the array. Note N is the number of + * coefficients, not the order. + * + * If coefficients are for the interval a to b, x must + * have been transformed to x -> 2(2x - b - a)/(b-a) before + * entering the routine. This maps x from (a, b) to (-1, 1), + * over which the Chebyshev polynomials are defined. 
+ * + * If the coefficients are for the inverted interval, in + * which (a, b) is mapped to (1/b, 1/a), the transformation + * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, + * this becomes x -> 4a/x - 1. + * + * + * + * SPEED: + * + * Taking advantage of the recurrence properties of the + * Chebyshev polynomials, the routine requires one more + * addition per loop than evaluating a nested polynomial of + * the same degree. + * + */ +template <typename Packet, int N> +struct generic_cheb_recurrence { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) { + EIGEN_STATIC_ASSERT((N > 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + return pmadd( + generic_cheb_recurrence<Packet, N - 1>::run(x, coef), x, + psub(pset1<Packet>(coef[N - 1]), generic_cheb_recurrence<Packet, N - + 2>::run(x, coef))); + } +}; + +template <typename Packet> +struct generic_cheb_recurrence<Packet, 2> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) { + return pmadd(pset1<Packet>(coef[0]), x, pset1<Packet>(coef[1])); + } +}; + +template <typename Packet> +struct generic_cheb_recurrence<Packet, 1> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) { + EIGEN_UNUSED_VARIABLE(x); + return pset1<Packet>(coef[0]); + } +}; + +template <typename Packet, int N> +struct pchebevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) { + const Packet half = pset1<Packet>(0.5); + return pmul(half, psub( + generic_cheb_recurrence<Packet, N>::run(x, coef), + generic_cheb_recurrence<Packet, N - 2>::run(x, coef))); + } +}; + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index ddd2979af..0aadefab7 100755 --- 
a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -114,7 +114,8 @@ template<> struct packet_traits<float> : default_packet_traits HasExpm1 = 1, HasNdtri = 1, HasExp = 1, - HasNdtri = 1, + HasI0e = 1, + HasI1e = 1, HasSqrt = 1, HasRsqrt = 1, HasTanh = EIGEN_FAST_MATH, diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 18889fcf4..749945031 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -215,6 +215,8 @@ template<typename Scalar> struct scalar_digamma_op; template<typename Scalar> struct scalar_erf_op; template<typename Scalar> struct scalar_erfc_op; template<typename Scalar> struct scalar_ndtri_op; +template<typename Scalar> struct scalar_i0e_op; +template<typename Scalar> struct scalar_i1e_op; template<typename Scalar> struct scalar_igamma_op; template<typename Scalar> struct scalar_igammac_op; template<typename Scalar> struct scalar_zeta_op; |