aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/AVX512
diff options
context:
space:
mode:
authorGravatar guoqiangqi <425418567@qq.com>2020-10-20 11:37:09 +0800
committerGravatar guoqiangqi <425418567@qq.com>2020-10-20 11:37:09 +0800
commit28aef8e816faadc0e51afbfe3fa91f10f477535d (patch)
treeda0c52dc6a8550e19de87f5954c83624ee94c298 /Eigen/src/Core/arch/AVX512
parent4a77eda1fdaebd7f92d587dfc0158a20dc0d2625 (diff)
Improve polynomial evaluation with instruction-level parallelism for pexp_float and pexp<Packet16f>
Diffstat (limited to 'Eigen/src/Core/arch/AVX512')
-rw-r--r--Eigen/src/Core/arch/AVX512/MathFunctions.h22
1 files changed, 11 insertions, 11 deletions
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index f6a43738d..bfd30c01a 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -85,17 +85,17 @@ pexp<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
Packet16f r2 = pmul(r, r);
-
- // TODO(gonnet): Split into odd/even polynomials and try to exploit
- // instruction-level parallelism.
- Packet16f y = p16f_cephes_exp_p0;
- y = pmadd(y, r, p16f_cephes_exp_p1);
- y = pmadd(y, r, p16f_cephes_exp_p2);
- y = pmadd(y, r, p16f_cephes_exp_p3);
- y = pmadd(y, r, p16f_cephes_exp_p4);
- y = pmadd(y, r, p16f_cephes_exp_p5);
- y = pmadd(y, r2, r);
- y = padd(y, p16f_1);
+ Packet16f r3 = pmul(r2, r);
+
+ // Evaluate the polynomial approximant, improved by instruction-level parallelism.
+ Packet16f y, y1, y2;
+ y = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1);
+ y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4);
+ y2 = padd(r, p16f_1);
+ y = pmadd(y, r, p16f_cephes_exp_p2);
+ y1 = pmadd(y1, r, p16f_cephes_exp_p5);
+ y = pmadd(y, r3, y1);
+ y = pmadd(y, r2, y2);
// Build emm0 = 2^m.
Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));