about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src
diff options
context:
space:
mode:
author: Gael Guennebaud <g.gael@free.fr> 2018-04-03 14:12:50 +0200
committer: Gael Guennebaud <g.gael@free.fr> 2018-04-03 14:12:50 +0200
commit: 7b0630315f343422b37f62f40a039c9e725fe9e1 (patch)
tree: 1864aa020d7c673e60cca9e2196c011eeb07d161 /Eigen/src
parent: 6719409cd92b19acabc4544f9ac5571a2ff9a88f (diff)
AVX512: fix psqrt and prsqrt
Diffstat (limited to 'Eigen/src')
-rw-r--r-- Eigen/src/Core/arch/AVX512/MathFunctions.h 22
1 files changed, 8 insertions, 14 deletions
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 399be0ee4..4695fbc81 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -266,8 +266,7 @@ psqrt<Packet16f>(const Packet16f& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
- Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
- _mm512_setzero_ps());
+ Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
@@ -289,8 +288,7 @@ psqrt<Packet8d>(const Packet8d& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
- Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
- _mm512_setzero_pd());
+ Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@@ -333,20 +331,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
- Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
- _mm512_rsqrt14_ps(_x));
+ Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps(
- neg_mask, p16f_nan,
- _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
+ neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places.
- return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
+ return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
}
template <>
@@ -363,14 +359,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
- Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
- _mm512_rsqrt14_pd(_x));
+ Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd(
- neg_mask, p8d_nan,
- _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
+ neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@@ -379,7 +373,7 @@ prsqrt<Packet8d>(const Packet8d& _x) {
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places.
- return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
+ return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
}
#else
template <>