about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/arch/SSE/MathFunctions.h
diff options
context:
space:
mode:
authorGravatar Benoit Jacob <jacob.benoit.1@gmail.com>2009-04-06 13:33:42 +0000
committerGravatar Benoit Jacob <jacob.benoit.1@gmail.com>2009-04-06 13:33:42 +0000
commit 502bf4a81dfd13630702e253fc265849d0e00ae6 (patch)
tree 47708e279a14ae6f9721f56d27fd54a50a96dcbe /Eigen/src/Core/arch/SSE/MathFunctions.h
parent 38f501a596aeafe1d3ff680ae6d2226ea6ed0cd2 (diff)
* fix the binary bloat issue; Rohit's idea was the right one
* a few dox fixes (the alloc routines do return 0 on error); also, I had forgotten to update the version number in CMakeLists
Diffstat (limited to 'Eigen/src/Core/arch/SSE/MathFunctions.h')
-rw-r--r-- Eigen/src/Core/arch/SSE/MathFunctions.h 138
1 file changed, 84 insertions, 54 deletions
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 7df9dc659..64f9640af 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -52,38 +52,34 @@
#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
#define EIGEN_MATH_FUNCTIONS_SSE_H
-_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0);
-_EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
-/* the smallest non denormalized float number */
-_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
-_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
-_EIGEN_DECLARE_CONST_Packet4i(1, 1);
-_EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-_EIGEN_DECLARE_CONST_Packet4i(2, 2);
-_EIGEN_DECLARE_CONST_Packet4i(4, 4);
-_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-/* natural logarithm computed for 4 simultaneous float
- return NaN for x <= 0
-*/
-_EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375);
-
template<> EIGEN_DONT_INLINE Packet4f ei_plog(Packet4f x)
{
+ _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0);
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
+ _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+ /* the smallest positive normalized (i.e. non-denormal) float */
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
+
+ /* natural logarithm computed for 4 floats simultaneously;
+ returns NaN for x <= 0
+ */
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375);
+
+
Packet4i emm0;
Packet4f invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
@@ -134,22 +130,27 @@ template<> EIGEN_DONT_INLINE Packet4f ei_plog(Packet4f x)
return _mm_or_ps(x, invalid_mask); // negative arg will be NAN
}
-_EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647949f);
-_EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+template<> EIGEN_DONT_INLINE Packet4f ei_pexp(Packet4f x)
+{
+ _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0);
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
+ _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647949f);
+ _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4);
+
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1);
-template<> EIGEN_DONT_INLINE Packet4f ei_pexp(Packet4f x)
-{
Packet4f tmp = _mm_setzero_ps(), fx;
Packet4i emm0;
@@ -202,19 +203,29 @@ template<> EIGEN_DONT_INLINE Packet4f ei_pexp(Packet4f x)
surprising but correct result.
*/
-_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625);
-_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4);
-_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8);
-_EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4);
-_EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3);
-_EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1);
-_EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005);
-_EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003);
-_EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002);
-_EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-
template<> EIGEN_DONT_INLINE Packet4f ei_psin(Packet4f x)
{
+ _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0);
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
+
+ _EIGEN_DECLARE_CONST_Packet4i(1, 1);
+ _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+ _EIGEN_DECLARE_CONST_Packet4i(2, 2);
+ _EIGEN_DECLARE_CONST_Packet4i(4, 4);
+
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
+
+ _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625);
+ _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4);
+ _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8);
+ _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4);
+ _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3);
+ _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005);
+ _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003);
+ _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
Packet4i emm0, emm2;
@@ -291,6 +302,25 @@ template<> EIGEN_DONT_INLINE Packet4f ei_psin(Packet4f x)
/* almost the same as ei_psin */
template<> Packet4f ei_pcos(Packet4f x)
{
+ _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0);
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
+
+ _EIGEN_DECLARE_CONST_Packet4i(1, 1);
+ _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+ _EIGEN_DECLARE_CONST_Packet4i(2, 2);
+ _EIGEN_DECLARE_CONST_Packet4i(4, 4);
+
+ _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625);
+ _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4);
+ _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8);
+ _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4);
+ _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3);
+ _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1);
+ _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005);
+ _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003);
+ _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002);
+ _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
Packet4i emm0, emm2;