From 5e0a178df2b70829e990e84602fd70f92131484f Mon Sep 17 00:00:00 2001
From: Tal Hadad
Date: Sun, 27 Sep 2015 16:51:24 +0300
Subject: Initial fork of unsupported module EulerAngles.

---
 unsupported/Eigen/EulerAngles                    |  32 ++++
 unsupported/Eigen/src/CMakeLists.txt             |   1 +
 unsupported/Eigen/src/EulerAngles/CMakeLists.txt |   6 +
 unsupported/Eigen/src/EulerAngles/EulerAngles.h  | 183 ++++++++++++++++++++
 unsupported/Eigen/src/EulerAngles/EulerSystem.h  | 211 +++++++++++++++++++++++
 unsupported/test/CMakeLists.txt                  |   2 +
 unsupported/test/EulerAngles.cpp                 |  18 ++
 7 files changed, 453 insertions(+)
 create mode 100644 unsupported/Eigen/EulerAngles
 create mode 100644 unsupported/Eigen/src/EulerAngles/CMakeLists.txt
 create mode 100644 unsupported/Eigen/src/EulerAngles/EulerAngles.h
 create mode 100644 unsupported/Eigen/src/EulerAngles/EulerSystem.h
 create mode 100644 unsupported/test/EulerAngles.cpp

diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles
new file mode 100644
index 000000000..a1aa5d829
--- /dev/null
+++ b/unsupported/Eigen/EulerAngles
@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#define EIGEN_EULERANGLES_MODULE_H
+
+
+#include "Eigen/Core"
+#include "Eigen/Geometry"
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+/**
+  * \defgroup EulerAngles_Module EulerAngles module
+  *
+  *
+  *
+  *
+  */
+
+#include "src/EulerAngles/EulerSystem.h"
+#include "src/EulerAngles/EulerAngles.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_EULERANGLES_MODULE_H
diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt
index 8eb2808e3..fae1c5854 100644
--- a/unsupported/Eigen/src/CMakeLists.txt
+++ b/unsupported/Eigen/src/CMakeLists.txt
@@ -12,3 +12,4 @@ ADD_SUBDIRECTORY(Skyline)
 ADD_SUBDIRECTORY(SparseExtra)
 ADD_SUBDIRECTORY(KroneckerProduct)
 ADD_SUBDIRECTORY(Splines)
+ADD_SUBDIRECTORY(EulerAngles)
diff --git a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
new file mode 100644
index 000000000..7986afc5e
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_EulerAngles_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_EulerAngles_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
+  )
diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
new file mode 100644
index 000000000..b3bd66441
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -0,0 +1,183 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLESCLASS_H // TODO: Fix previous "EIGEN_EULERANGLES_H" definition?
+#define EIGEN_EULERANGLESCLASS_H
+
+namespace Eigen
+{
+  /*template <typename Other>
+  struct ei_eulerangles_assign_impl;*/
+
+  /** \class EulerAngles
+    *
+    * \brief Represents a rotation in a 3 dimensional space as three Euler angles
+    *
+    * \param _Scalar the scalar type, i.e., the type of the angles.
+    *
+    * \sa cl
+    */
+  template <typename _Scalar, class _System>
+  class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
+  {
+    public:
+      /** the scalar type of the coefficients */
+      typedef _Scalar Scalar;
+      typedef _System System;
+
+      typedef Matrix<Scalar,3,3> Matrix3;
+      typedef Matrix<Scalar,3,1> Vector3;
+      typedef Quaternion<Scalar> QuaternionType;
+      typedef AngleAxis<Scalar> AngleAxisType;
+
+    protected:
+
+      Vector3 m_angles;
+
+    public:
+
+      EulerAngles() {}
+      inline EulerAngles(Scalar a0, Scalar a1, Scalar a2) : m_angles(a0, a1, a2) {}
+      inline EulerAngles(Vector3 angles) : m_angles(angles) {}
+      inline EulerAngles(const QuaternionType& q) { *this = q; }
+      inline EulerAngles(const AngleAxisType& aa) { *this = aa; }
+      template <typename Derived>
+      inline EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
+
+      // TODO: Support assignment from euler to euler
+
+      Scalar angle(int i) const { return m_angles.coeff(i); }
+      Scalar& angle(int i) { return m_angles.coeffRef(i); }
+
+      const Vector3& coeffs() const { return m_angles; }
+      Vector3& coeffs() { return m_angles; }
+
+      // TODO: Add set/get functions
+
+      Scalar h() const { return m_angles[0]; }
+      Scalar& h() { return m_angles[0]; }
+
+      Scalar p() const { return m_angles[1]; }
+      Scalar& p() { return m_angles[1]; }
+
+      Scalar r() const { return m_angles[2]; }
+      Scalar& r() { return m_angles[2]; }
+
+      EulerAngles& invert()
+      {
+        //m_angles = -m_angles;// I want to do this but there could be an aliasing issue!
+        m_angles *= -1;
+
+        return *this;
+      }
+
+      EulerAngles inverse() const
+      {
+        EulerAngles res;
+        res.m_angles = -m_angles;
+        return res;
+      }
+
+      EulerAngles operator -() const
+      {
+        return inverse();
+      }
+
+      /** Constructs and \returns an equivalent 3x3 rotation matrix.
+        */
+      template <typename Derived>
+      // TODO: Add booleans which let the user control desired output angles range( (-PI, PI) or [0, 2*PI) )
+      EulerAngles& fromRotationMatrix(const MatrixBase<Derived>& m)
+      {
+        System::eulerAngles(*this, m);
+        return *this;
+      }
+
+      /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinent of +1).
+        */
+      template <typename Derived>
+      EulerAngles& operator=(const MatrixBase<Derived>& mat) {
+        return fromRotationMatrix(mat);
+      }
+
+      // TODO: Assign and construct from another EulerAngle (with different system)
+
+      /** Set \c *this from a quaternion.
+        * The axis is normalized.
+        */
+      EulerAngles& operator=(const QuaternionType& q) {
+        // TODO: Implement it in a better way
+        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+        // we can compute only the needed matrix cells and then convert to euler angles.
+        // Currently we compute all matrix cells from quaternion.
+
+        fromRotationMatrix(q.toRotationMatrix());
+
+        // Special case only for ZYX
+        /*Scalar y2 = q.y() * q.y();
+        m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+        m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+        m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+        */
+
+        return *this;
+      }
+
+      /** Set \c *this from AngleAxis \a ea.
+        */
+      EulerAngles& operator=(const AngleAxisType& ea)
+      {
+        // TODO: Implement it in a better way
+        return *this = ea.toRotationMatrix();
+      }
+
+      // TODO: Fix this function, and make it generic
+      Matrix3 toRotationMatrix(void) const
+      {
+        return static_cast<QuaternionType>(*this).toRotationMatrix();
+      }
+
+      operator QuaternionType() const
+      {
+        return
+          AngleAxisType((System::IsHeadingOpposite ? -1 : 1) * h(), Vector3::Unit(System::HeadingAxisAbs - 1)) *
+          AngleAxisType((System::IsPitchOpposite ? -1 : 1) * p(), Vector3::Unit(System::PitchAxisAbs - 1)) *
+          AngleAxisType((System::IsRollOpposite ? -1 : 1) * r(), Vector3::Unit(System::RollAxisAbs - 1));
+      }
+  };
+
+  typedef EulerAngles<double, EulerSystemXYZ> EulerAnglesXYZd;
+  typedef EulerAngles<double, EulerSystemXYX> EulerAnglesXYXd;
+  typedef EulerAngles<double, EulerSystemXZY> EulerAnglesXZYd;
+  typedef EulerAngles<double, EulerSystemXZX> EulerAnglesXZXd;
+
+  typedef EulerAngles<double, EulerSystemYZX> EulerAnglesYZXd;
+  typedef EulerAngles<double, EulerSystemYZY> EulerAnglesYZYd;
+  typedef EulerAngles<double, EulerSystemYXZ> EulerAnglesYXZd;
+  typedef EulerAngles<double, EulerSystemYXY> EulerAnglesYXYd;
+
+  typedef EulerAngles<double, EulerSystemZXY> EulerAnglesZXYd;
+  typedef EulerAngles<double, EulerSystemZXZ> EulerAnglesZXZd;
+  typedef EulerAngles<double, EulerSystemZYX> EulerAnglesZYXd;
+  typedef EulerAngles<double, EulerSystemZYZ> EulerAnglesZYZd;
+
+  namespace internal
+  {
+    template <typename _Scalar, class _System>
+    struct traits<EulerAngles<_Scalar, _System> >
+    {
+      typedef _Scalar Scalar;
+    };
+  }
+
+}
+
+#endif // EIGEN_EULERANGLESCLASS_H
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
new file mode 100644
index 000000000..fc782e914
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -0,0 +1,211 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERSYSTEM_H
+#define EIGEN_EULERSYSTEM_H
+
+namespace Eigen
+{
+  // Forward declarations
+  template <typename _Scalar, class _System>
+  class EulerAngles;
+
+  namespace internal
+  {
+    // TODO: Check if already exists on the rest of the API
+    template <int Num, bool IsPositive = (Num > 0)>
+    struct Abs
+    {
+      enum { value = Num };
+    };
+
+    template <int Num>
+    struct Abs<Num, false>
+    {
+      enum { value = -Num };
+    };
+
+    template <bool Cond>
+    struct NegateIf
+    {
+      template <typename T>
+      static void run(T& t)
+      {
+        t = -t;
+      }
+    };
+
+    template <>
+    struct NegateIf<false>
+    {
+      template <typename T>
+      static void run(T& t)
+      {
+        // no op
+      }
+    };
+
+    template <bool Cond1, bool Cond2>
+    struct NegateIfXor : NegateIf<Cond1 != Cond2> {};
+
+    template <int Axis>
+    struct IsValidAxis
+    {
+      enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
+    };
+  }
+
+  enum EulerAxis
+  {
+    EULER_X = 1,
+    EULER_Y = 2,
+    EULER_Z = 3
+  };
+
+  template <int _HeadingAxis, int _PitchAxis, int _RollAxis>
+  class EulerSystem
+  {
+    public:
+    // It's defined this way and not as enum, because I think
+    //  that enum is not guaranteed to support negative numbers
+    static const int HeadingAxis = _HeadingAxis;
+    static const int PitchAxis = _PitchAxis;
+    static const int RollAxis = _RollAxis;
+
+    enum
+    {
+      HeadingAxisAbs = internal::Abs<HeadingAxis>::value,
+      PitchAxisAbs = internal::Abs<PitchAxis>::value,
+      RollAxisAbs = internal::Abs<RollAxis>::value,
+
+      IsHeadingOpposite = (HeadingAxis < 0) ? 1 : 0,
+      IsPitchOpposite = (PitchAxis < 0) ? 1 : 0,
+      IsRollOpposite = (RollAxis < 0) ? 1 : 0,
+
+      IsOdd = ((HeadingAxisAbs)%3 == (PitchAxisAbs - 1)%3) ? 0 : 1,
+      IsEven = IsOdd ? 0 : 1,
+
+      // TODO: Assert this, and sort it in a better way
+      IsValid = ((unsigned)HeadingAxisAbs != (unsigned)PitchAxisAbs &&
+        (unsigned)PitchAxisAbs != (unsigned)RollAxisAbs &&
+        internal::IsValidAxis<HeadingAxis>::value && internal::IsValidAxis<PitchAxis>::value && internal::IsValidAxis<RollAxis>::value) ?
1 : 0,
+
+      // TODO: After a proper assertion, remove the "IsValid" from this expression
+      IsTaitBryan = (IsValid && (unsigned)HeadingAxisAbs != (unsigned)RollAxisAbs) ? 1 : 0
+    };
+
+    private:
+
+    enum
+    {
+      // I, J, K are the pivot indices permutation for the rotation matrix, that match this Euler system.
+      // They are used in this class's converters.
+      // They are always different from each other, and their possible values are: 0, 1, or 2.
+      I = HeadingAxisAbs - 1,
+      J = (HeadingAxisAbs - 1 + 1 + IsOdd)%3,
+      K = (HeadingAxisAbs - 1 + 2 - IsOdd)%3
+    };
+
+    template <typename Derived>
+    static void eulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type isTaitBryan)
+    {
+      using std::atan2;
+      using std::sin;
+      using std::cos;
+
+      typedef typename Derived::Scalar Scalar;
+      typedef Matrix<Scalar,2,1> Vector2;
+
+      res[0] = atan2(mat(J,K), mat(K,K));
+      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
+        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+        res[1] = atan2(-mat(I,K), -c2);
+      }
+      else
+        res[1] = atan2(-mat(I,K), c2);
+      Scalar s1 = sin(res[0]);
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
+    }
+
+    template <typename Derived>
+    static void eulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type isTaitBryan)
+    {
+      using std::atan2;
+      using std::sin;
+      using std::cos;
+
+      typedef typename Derived::Scalar Scalar;
+      typedef Matrix<Scalar,2,1> Vector2;
+
+      res[0] = atan2(mat(J,I), mat(K,I));
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
+      {
+        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+        res[1] = -atan2(s2, mat(I,I));
+      }
+      else
+      {
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+        res[1] = atan2(s2, mat(I,I));
+      }
+
+      // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+      // we can compute their respective rotation, and apply its inverse to M. Since the result must
+      // be a rotation around x, we have:
+      //
+      //  c2  s1.s2 c1.s2                   1  0   0
+      //  0   c1    -s1       *    M    =   0  c3  s3
+      //  -s2 s1.c2 c1.c2                   0 -s3  c3
+      //
+      //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+
+      Scalar s1 = sin(res[0]);
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
+    }
+
+    public:
+
+    // TODO: Support headingAxisVector(), ..
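+    // Note: eulerAngles() below maps the abstract heading/pitch/roll axes to the
+    // matrix pivot indices I, J, K defined above. For example, for
+    // EulerSystem<EULER_Z, EULER_Y, EULER_X> (a Tait-Bryan system): HeadingAxisAbs=3
+    // and PitchAxisAbs=2, hence IsOdd=1 and I=2, J=(2+1+1)%3=1, K=(2+2-1)%3=0;
+    // for EulerSystem<EULER_X, EULER_Y, EULER_Z>: IsOdd=0 and I,J,K = 0,1,2.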
+
+    template <typename Scalar>
+    static void eulerAngles(EulerAngles<Scalar, EulerSystem>& res, const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      eulerAngles_imp(
+        res.coeffs(), mat,
+        typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
+
+      internal::NegateIfXor<IsHeadingOpposite, IsEven>::run(res.h());
+
+      internal::NegateIfXor<IsPitchOpposite, IsEven>::run(res.p());
+
+      internal::NegateIfXor<IsRollOpposite, IsEven>::run(res.r());
+    }
+  };
+
+  typedef EulerSystem<EULER_X, EULER_Y, EULER_Z> EulerSystemXYZ;
+  typedef EulerSystem<EULER_X, EULER_Y, EULER_X> EulerSystemXYX;
+  typedef EulerSystem<EULER_X, EULER_Z, EULER_Y> EulerSystemXZY;
+  typedef EulerSystem<EULER_X, EULER_Z, EULER_X> EulerSystemXZX;
+
+  typedef EulerSystem<EULER_Y, EULER_Z, EULER_X> EulerSystemYZX;
+  typedef EulerSystem<EULER_Y, EULER_Z, EULER_Y> EulerSystemYZY;
+  typedef EulerSystem<EULER_Y, EULER_X, EULER_Z> EulerSystemYXZ;
+  typedef EulerSystem<EULER_Y, EULER_X, EULER_Y> EulerSystemYXY;
+
+  typedef EulerSystem<EULER_Z, EULER_X, EULER_Y> EulerSystemZXY;
+  typedef EulerSystem<EULER_Z, EULER_X, EULER_Z> EulerSystemZXZ;
+  typedef EulerSystem<EULER_Z, EULER_Y, EULER_X> EulerSystemZYX;
+  typedef EulerSystem<EULER_Z, EULER_Y, EULER_Z> EulerSystemZYZ;
+}
+
+#endif // EIGEN_EULERSYSTEM_H
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 79e70ced4..653392e40 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -45,6 +45,8 @@ ei_add_test(alignedvector3)
 
 ei_add_test(FFT)
 
+ei_add_test(EulerAngles)
+
 find_package(MPFR 2.3.0)
 find_package(GMP)
 if(MPFR_FOUND)
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
new file mode 100644
index 000000000..d03db1ac3
--- /dev/null
+++ b/unsupported/test/EulerAngles.cpp
@@ -0,0 +1,18 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/EulerAngles>
+
+void test_EulerAngles()
+{
+  //CALL_SUBTEST( test_return_by_value(32) );
+
+}
-- 
cgit v1.2.3


From 6752a69aa521d4a0979d575af79b4acfd54dd089 Mon Sep 17 00:00:00 2001
From: Tal Hadad
Date: Sun, 20 Dec 2015 12:49:12 +0200
Subject: Much better tests, and a little bit more functionality.
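
A minimal usage sketch of the API this series builds up (illustrative
only: the angle values are arbitrary, and EulerAnglesZYXd is one of the
typedefs added in the initial commit):

    Eigen::EulerAnglesZYXd ea(0.1, 0.2, 0.3);    // heading, pitch, roll (radians)
    Eigen::Quaterniond q = ea;                   // Euler -> quaternion, via operator QuaternionType()
    Eigen::Matrix3d m = ea.toRotationMatrix();   // Euler -> rotation matrix
    Eigen::EulerAnglesZYXd ea2(m);               // matrix -> Euler, via System::eulerAngles()
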
--- unsupported/Eigen/src/EulerAngles/EulerAngles.h | 20 +++- unsupported/Eigen/src/EulerAngles/EulerSystem.h | 26 ++++- unsupported/test/EulerAngles.cpp | 121 +++++++++++++++++++++++- 3 files changed, 158 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h index b3bd66441..ccde28eb6 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerAngles.h +++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h @@ -37,16 +37,26 @@ namespace Eigen typedef Matrix Vector3; typedef Quaternion QuaternionType; typedef AngleAxis AngleAxisType; + + static Vector3 HeadingAxisVector() { + return internal::NegativeIf::run(Vector3::Unit(System::HeadingAxisAbs - 1)); + } + + static Vector3 PitchAxisVector() { + return internal::NegativeIf::run(Vector3::Unit(System::PitchAxisAbs - 1)); + } + + static Vector3 RollAxisVector() { + return internal::NegativeIf::run(Vector3::Unit(System::RollAxisAbs - 1)); + } - protected: - + private: Vector3 m_angles; public: EulerAngles() {} inline EulerAngles(Scalar a0, Scalar a1, Scalar a2) : m_angles(a0, a1, a2) {} - inline EulerAngles(Vector3 angles) : m_angles(angles) {} inline EulerAngles(const QuaternionType& q) { *this = q; } inline EulerAngles(const AngleAxisType& aa) { *this = aa; } template @@ -116,7 +126,7 @@ namespace Eigen EulerAngles& operator=(const QuaternionType& q){ // TODO: Implement it in a better way // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/ - // we can compute only the needed matrix cells and then convert to euler angles. + // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below) // Currently we compute all matrix cells from quaternion. fromRotationMatrix(q.toRotationMatrix()); @@ -131,6 +141,8 @@ namespace Eigen return *this; } + // TODO: Support isApprox function + /** Set \c *this from AngleAxis \a ea. 
*/ EulerAngles& operator=(const AngleAxisType& ea) diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h index fc782e914..6ee3f51df 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h +++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h @@ -31,6 +31,26 @@ namespace Eigen enum { value = -Num }; }; + template + struct NegativeIf + { + template + static T run(const T& t) + { + return -t; + } + }; + + template <> + struct NegativeIf + { + template + static T run(const T& t) + { + return t; + } + }; + template struct NegateIf { @@ -45,7 +65,7 @@ namespace Eigen struct NegateIf { template - static void run(T& t) + static void run(T&) { // no op } @@ -113,7 +133,7 @@ namespace Eigen }; template - static void eulerAngles_imp(Matrix::Scalar, 3, 1>& res, const MatrixBase& mat, internal::true_type isTaitBryan) + static void eulerAngles_imp(Matrix::Scalar, 3, 1>& res, const MatrixBase& mat, internal::true_type /*isTaitBryan*/) { using std::atan2; using std::sin; @@ -136,7 +156,7 @@ namespace Eigen } template - static void eulerAngles_imp(Matrix::Scalar,3,1>& res, const MatrixBase& mat, internal::false_type isTaitBryan) + static void eulerAngles_imp(Matrix::Scalar,3,1>& res, const MatrixBase& mat, internal::false_type /*isTaitBryan*/) { using std::atan2; using std::sin; diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp index d03db1ac3..57a34776a 100644 --- a/unsupported/test/EulerAngles.cpp +++ b/unsupported/test/EulerAngles.cpp @@ -11,8 +11,125 @@ #include -void test_EulerAngles() +using namespace Eigen; + +template +void verify_euler(const Matrix& ea) +{ + typedef EulerAngles EulerAnglesType; + typedef Matrix Matrix3; + typedef Matrix Vector3; + typedef AngleAxis AngleAxisx; + using std::abs; + + const int i = EulerSystem::HeadingAxisAbs - 1; + const int j = EulerSystem::PitchAxisAbs - 1; + const int k = EulerSystem::RollAxisAbs - 1; + + const int iFactor = EulerSystem::IsHeadingOpposite ? -1 : 1; + const int jFactor = EulerSystem::IsPitchOpposite ? -1 : 1; + const int kFactor = EulerSystem::IsRollOpposite ? -1 : 1; + + const Vector3 I = EulerAnglesType::HeadingAxisVector(); + const Vector3 J = EulerAnglesType::PitchAxisVector(); + const Vector3 K = EulerAnglesType::RollAxisVector(); + + EulerAnglesType e(ea[0], ea[1], ea[2]); + + Matrix3 m(e); + Vector3 eabis = EulerAnglesType(m).coeffs(); + Vector3 eabis2 = m.eulerAngles(i, j, k); + eabis2[0] *= iFactor; + eabis2[1] *= jFactor; + eabis2[2] *= kFactor; + + VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is + + Matrix3 mbis(AngleAxisx(eabis[0], I) * AngleAxisx(eabis[1], J) * AngleAxisx(eabis[2], K)); + VERIFY_IS_APPROX(m, mbis); + /* If I==K, and ea[1]==0, then there no unique solution. */ + /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. 
*/ + if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision())) ) + VERIFY((ea-eabis).norm() <= test_precision()); + + // approx_or_less_than does not work for 0 + VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1))); + VERIFY_IS_APPROX_OR_LESS_THAN(eabis[0], Scalar(EIGEN_PI)); + VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(EIGEN_PI), eabis[1]); + VERIFY_IS_APPROX_OR_LESS_THAN(eabis[1], Scalar(EIGEN_PI)); + VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(EIGEN_PI), eabis[2]); + VERIFY_IS_APPROX_OR_LESS_THAN(eabis[2], Scalar(EIGEN_PI)); +} + +template void check_all_var(const Matrix& ea) +{ + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); +} + +template void eulerangles() { - //CALL_SUBTEST( test_return_by_value(32) ); + typedef Matrix Matrix3; + typedef Matrix Vector3; + typedef Array Array3; + typedef Quaternion Quaternionx; + typedef AngleAxis AngleAxisx; + + Scalar a = internal::random(-Scalar(EIGEN_PI), Scalar(EIGEN_PI)); + Quaternionx q1; + q1 = AngleAxisx(a, Vector3::Random().normalized()); + Matrix3 m; + m = q1; + + Vector3 ea = m.eulerAngles(0,1,2); + check_all_var(ea); + ea = m.eulerAngles(0,1,0); + check_all_var(ea); + + // Check with purely random Quaternion: + q1.coeffs() = Quaternionx::Coefficients::Random().normalized(); + m = q1; + ea = m.eulerAngles(0,1,2); + check_all_var(ea); + ea = m.eulerAngles(0,1,0); + check_all_var(ea); + + // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi]. + ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1); + check_all_var(ea); + + ea[2] = ea[0] = internal::random(0,Scalar(EIGEN_PI)); + check_all_var(ea); + ea[0] = ea[1] = internal::random(0,Scalar(EIGEN_PI)); + check_all_var(ea); + + ea[1] = 0; + check_all_var(ea); + + ea.head(2).setZero(); + check_all_var(ea); + + ea.setZero(); + check_all_var(ea); +} + +void test_EulerAngles() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( eulerangles() ); + CALL_SUBTEST_2( eulerangles() ); + } } -- cgit v1.2.3 From b091b7e6ea0d78718fb41a56251021e06bbb15be Mon Sep 17 00:00:00 2001 From: Tal Hadad Date: Sun, 20 Dec 2015 13:00:07 +0200 Subject: Remove unneccesary comment. --- unsupported/Eigen/src/EulerAngles/EulerSystem.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h index 6ee3f51df..ba33d5400 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h +++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h @@ -194,8 +194,6 @@ namespace Eigen } public: - - // TODO: Support headingAxisVector(), .. template static void eulerAngles(EulerAngles& res, const typename EulerAngles::Matrix3& mat) -- cgit v1.2.3 From bfed274df36a9244c067b3658c057cac4e13c886 Mon Sep 17 00:00:00 2001 From: Tal Hadad Date: Sun, 20 Dec 2015 16:24:53 +0200 Subject: Use RotationBase, test quaternions and support ranges. 
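
A sketch of the range control added here (illustrative; the input
rotation is arbitrary). Each of the three booleans asks for the
corresponding output angle to be shifted into [0, 2*pi) instead of the
default signed range:

    Eigen::Quaterniond q(Eigen::AngleAxisd(-0.3, Eigen::Vector3d::UnitZ()));
    Eigen::EulerAnglesZYXd ea(q);                       // RotationBase overload, default ranges
    Eigen::EulerAnglesZYXd eaPos(q, true, true, true);  // all angles wrapped into [0, 2*pi)
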
--- unsupported/Eigen/src/EulerAngles/EulerAngles.h | 137 +++++++++++++++++------- unsupported/Eigen/src/EulerAngles/EulerSystem.h | 61 ++++++++++- unsupported/test/EulerAngles.cpp | 137 ++++++++++++++++++------ 3 files changed, 263 insertions(+), 72 deletions(-) diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h index ccde28eb6..3362d9c3e 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerAngles.h +++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h @@ -57,10 +57,32 @@ namespace Eigen EulerAngles() {} inline EulerAngles(Scalar a0, Scalar a1, Scalar a2) : m_angles(a0, a1, a2) {} - inline EulerAngles(const QuaternionType& q) { *this = q; } - inline EulerAngles(const AngleAxisType& aa) { *this = aa; } + template inline EulerAngles(const MatrixBase& m) { *this = m; } + + template + inline EulerAngles( + const MatrixBase& m, + bool positiveRangeHeading, + bool positiveRangePitch, + bool positiveRangeRoll) { + + fromRotation(m, positiveRangeHeading, positiveRangePitch, positiveRangeRoll); + } + + template + inline EulerAngles(const RotationBase& rot) { *this = rot; } + + template + inline EulerAngles( + const RotationBase& rot, + bool positiveRangeHeading, + bool positiveRangePitch, + bool positiveRangeRoll) { + + fromRotation(rot, positiveRangeHeading, positiveRangePitch, positiveRangeRoll); + } // TODO: Support assignment from euler to euler @@ -104,65 +126,108 @@ namespace Eigen /** Constructs and \returns an equivalent 3x3 rotation matrix. */ template - // TODO: Add booleans which let the user control desired output angles range( (-PI, PI) or [0, 2*PI) ) - EulerAngles& fromRotationMatrix(const MatrixBase& m) + EulerAngles& fromRotation(const MatrixBase& m) { System::eulerAngles(*this, m); return *this; } - /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinent of +1). - */ + template< + bool PositiveRangeHeading, + bool PositiveRangePitch, + bool PositiveRangeRoll, + typename Derived> + EulerAngles& fromRotation(const MatrixBase& m) + { + System::eulerAngles(*this, m); + return *this; + } + template - EulerAngles& operator=(const MatrixBase& mat){ - return fromRotationMatrix(mat); + EulerAngles& fromRotation( + const MatrixBase& m, + bool positiveRangeHeading, + bool positiveRangePitch, + bool positiveRangeRoll) + { + System::eulerAngles(*this, m, positiveRangeHeading, positiveRangePitch, positiveRangeRoll); + return *this; } - - // TODO: Assign and construct from another EulerAngle (with different system) - /** Set \c *this from a quaternion. - * The axis is normalized. 
- */ - EulerAngles& operator=(const QuaternionType& q){ - // TODO: Implement it in a better way + template + EulerAngles& fromRotation(const RotationBase& rot) + { + return fromRotation(rot.toRotationMatrix()); + } + + template< + bool PositiveRangeHeading, + bool PositiveRangePitch, + bool PositiveRangeRoll, + typename Derived> + EulerAngles& fromRotation(const RotationBase& rot) + { + return fromRotation(rot.toRotationMatrix()); + } + + template + EulerAngles& fromRotation( + const RotationBase& rot, + bool positiveRangeHeading, + bool positiveRangePitch, + bool positiveRangeRoll) + { + return fromRotation(rot.toRotationMatrix(), positiveRangeHeading, positiveRangePitch, positiveRangeRoll); + } + + /*EulerAngles& fromQuaternion(const QuaternionType& q) + { + // TODO: Implement it in a faster way for quaternions // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/ // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below) // Currently we compute all matrix cells from quaternion. - fromRotationMatrix(q.toRotationMatrix()); - // Special case only for ZYX - /*Scalar y2 = q.y() * q.y(); - m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z()))); - m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x())); - m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2))); + //Scalar y2 = q.y() * q.y(); + //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z()))); + //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x())); + //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2))); + }*/ + + /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinent of +1). */ - - return *this; + template + EulerAngles& operator=(const MatrixBase& mat) { + return fromRotation(mat); } + + // TODO: Assign and construct from another EulerAngle (with different system) - // TODO: Support isApprox function + /** Set \c *this from a rotation. + */ + template + EulerAngles& operator=(const RotationBase& rot) { + return fromRotation(rot.toRotationMatrix()); + } - /** Set \c *this from AngleAxis \a ea. - */ - EulerAngles& operator=(const AngleAxisType& ea) + // TODO: Support isApprox function + + Matrix3 toRotationMatrix() const { - // TODO: Implement it in a better way - return *this = ea.toRotationMatrix(); + return static_cast(*this).toRotationMatrix(); } - // TODO: Fix this function, and make it generic - Matrix3 toRotationMatrix(void) const + QuaternionType toQuaternion() const { - return static_cast(*this).toRotationMatrix(); + return + AngleAxisType(h(), HeadingAxisVector()) * + AngleAxisType(p(), PitchAxisVector()) * + AngleAxisType(r(), RollAxisVector()); } operator QuaternionType() const { - return - AngleAxisType((System::IsHeadingOpposite ? -1 : 1) * h(), Vector3::Unit(System::HeadingAxisAbs - 1)) * - AngleAxisType((System::IsPitchOpposite ? -1 : 1) * p(), Vector3::Unit(System::PitchAxisAbs - 1)) * - AngleAxisType((System::IsRollOpposite ? 
-1 : 1) * r(), Vector3::Unit(System::RollAxisAbs - 1)); + return toQuaternion(); } }; diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h index ba33d5400..9699dd10d 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h +++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h @@ -19,7 +19,7 @@ namespace Eigen namespace internal { // TODO: Check if already exists on the rest API - template 0)> + template 0)> struct Abs { enum { value = Num }; @@ -73,6 +73,26 @@ namespace Eigen template struct NegateIfXor : NegateIf {}; + + template + struct AddConstIf + { + template + static void run(T& t) + { + t += T(value); + } + }; + + template + struct AddConstIf + { + template + static void run(T&) + { + // no op + } + }; template struct IsValidAxis @@ -196,17 +216,50 @@ namespace Eigen public: template - static void eulerAngles(EulerAngles& res, const typename EulerAngles::Matrix3& mat) + static void eulerAngles( + EulerAngles& res, + const typename EulerAngles::Matrix3& mat) + { + eulerAngles(res, mat, false, false, false); + } + + template< + typename Scalar, + bool PositiveRangeHeading, + bool PositiveRangePitch, + bool PositiveRangeRoll> + static void eulerAngles( + EulerAngles& res, + const typename EulerAngles::Matrix3& mat) + { + eulerAngles(res, mat, PositiveRangeHeading, PositiveRangePitch, PositiveRangeRoll); + } + + template + static void eulerAngles( + EulerAngles& res, + const typename EulerAngles::Matrix3& mat, + bool positiveRangeHeading, + bool positiveRangePitch, + bool positiveRangeRoll) { eulerAngles_imp( res.coeffs(), mat, typename internal::conditional::type()); internal::NegateIfXor::run(res.h()); - internal::NegateIfXor::run(res.p()); - internal::NegateIfXor::run(res.r()); + + // Saturate results to the requested range + if (positiveRangeHeading && (res.h() < 0)) + res.h() += Scalar(2 * EIGEN_PI); + + if (positiveRangePitch && (res.p() < 0)) + res.p() += Scalar(2 * EIGEN_PI); + + if (positiveRangeRoll && (res.r() < 0)) + res.r() += Scalar(2 * EIGEN_PI); } }; diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp index 57a34776a..e65399ffa 100644 --- a/unsupported/test/EulerAngles.cpp +++ b/unsupported/test/EulerAngles.cpp @@ -14,14 +14,53 @@ using namespace Eigen; template -void verify_euler(const Matrix& ea) +void verify_euler_ranged(const Matrix& ea, + bool positiveRangeHeading, bool positiveRangePitch, bool positiveRangeRoll) { typedef EulerAngles EulerAnglesType; typedef Matrix Matrix3; typedef Matrix Vector3; - typedef AngleAxis AngleAxisx; + typedef Quaternion QuaternionType; + typedef AngleAxis AngleAxisType; using std::abs; + Scalar headingRangeStart, headingRangeEnd; + Scalar pitchRangeStart, pitchRangeEnd; + Scalar rollRangeStart, rollRangeEnd; + + if (positiveRangeHeading) + { + headingRangeStart = Scalar(0); + headingRangeEnd = Scalar(2 * EIGEN_PI); + } + else + { + headingRangeStart = -Scalar(EIGEN_PI); + headingRangeEnd = Scalar(EIGEN_PI); + } + + if (positiveRangePitch) + { + pitchRangeStart = Scalar(0); + pitchRangeEnd = Scalar(2 * EIGEN_PI); + } + else + { + pitchRangeStart = -Scalar(EIGEN_PI); + pitchRangeEnd = Scalar(EIGEN_PI); + } + + if (positiveRangeRoll) + { + rollRangeStart = Scalar(0); + rollRangeEnd = Scalar(2 * EIGEN_PI); + } + else + { + rollRangeStart = -Scalar(EIGEN_PI); + rollRangeEnd = Scalar(EIGEN_PI); + } + const int i = EulerSystem::HeadingAxisAbs - 1; const int j = EulerSystem::PitchAxisAbs - 1; const int k = EulerSystem::RollAxisAbs - 1; @@ -37,46 +76,80 
@@ void verify_euler(const Matrix& ea) EulerAnglesType e(ea[0], ea[1], ea[2]); Matrix3 m(e); - Vector3 eabis = EulerAnglesType(m).coeffs(); + Vector3 eabis = EulerAnglesType(m, positiveRangeHeading, positiveRangePitch, positiveRangeRoll).coeffs(); + + // Check that eabis in range + VERIFY(headingRangeStart <= eabis[0] && eabis[0] <= headingRangeEnd); + VERIFY(pitchRangeStart <= eabis[1] && eabis[1] <= pitchRangeEnd); + VERIFY(rollRangeStart <= eabis[2] && eabis[2] <= rollRangeEnd); + Vector3 eabis2 = m.eulerAngles(i, j, k); + + // Invert the relevant axes eabis2[0] *= iFactor; eabis2[1] *= jFactor; eabis2[2] *= kFactor; + // Saturate the angles to the correct range + if (positiveRangeHeading && (eabis2[0] < 0)) + eabis2[0] += Scalar(2 * EIGEN_PI); + if (positiveRangePitch && (eabis2[1] < 0)) + eabis2[1] += Scalar(2 * EIGEN_PI); + if (positiveRangeRoll && (eabis2[2] < 0)) + eabis2[2] += Scalar(2 * EIGEN_PI); + VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is - Matrix3 mbis(AngleAxisx(eabis[0], I) * AngleAxisx(eabis[1], J) * AngleAxisx(eabis[2], K)); + Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K)); VERIFY_IS_APPROX(m, mbis); - /* If I==K, and ea[1]==0, then there no unique solution. */ - /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */ - if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision())) ) - VERIFY((ea-eabis).norm() <= test_precision()); - - // approx_or_less_than does not work for 0 - VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1))); - VERIFY_IS_APPROX_OR_LESS_THAN(eabis[0], Scalar(EIGEN_PI)); - VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(EIGEN_PI), eabis[1]); - VERIFY_IS_APPROX_OR_LESS_THAN(eabis[1], Scalar(EIGEN_PI)); - VERIFY_IS_APPROX_OR_LESS_THAN(-Scalar(EIGEN_PI), eabis[2]); - VERIFY_IS_APPROX_OR_LESS_THAN(eabis[2], Scalar(EIGEN_PI)); + + // Tests that are only relevant for no possitive range + if (!(positiveRangeHeading || positiveRangePitch || positiveRangeRoll)) + { + /* If I==K, and ea[1]==0, then there no unique solution. */ + /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. 
*/ + if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision())) ) + VERIFY((ea-eabis).norm() <= test_precision()); + + // approx_or_less_than does not work for 0 + VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1))); + } + + // Quaternions + QuaternionType q(e); + eabis = EulerAnglesType(q, positiveRangeHeading, positiveRangePitch, positiveRangeRoll).coeffs(); + VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same +} + +template +void verify_euler(const Matrix& ea) +{ + verify_euler_ranged(ea, false, false, false); + verify_euler_ranged(ea, false, false, true); + verify_euler_ranged(ea, false, true, false); + verify_euler_ranged(ea, false, true, true); + verify_euler_ranged(ea, true, false, false); + verify_euler_ranged(ea, true, false, true); + verify_euler_ranged(ea, true, true, false); + verify_euler_ranged(ea, true, true, true); } template void check_all_var(const Matrix& ea) { - verify_euler(ea); - verify_euler(ea); - verify_euler(ea); - verify_euler(ea); - - verify_euler(ea); - verify_euler(ea); - verify_euler(ea); - verify_euler(ea); - - verify_euler(ea); - verify_euler(ea); - verify_euler(ea); - verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); + verify_euler(ea); } template void eulerangles() @@ -85,11 +158,11 @@ template void eulerangles() typedef Matrix Vector3; typedef Array Array3; typedef Quaternion Quaternionx; - typedef AngleAxis AngleAxisx; + typedef AngleAxis AngleAxisType; Scalar a = internal::random(-Scalar(EIGEN_PI), Scalar(EIGEN_PI)); Quaternionx q1; - q1 = AngleAxisx(a, Vector3::Random().normalized()); + q1 = AngleAxisType(a, Vector3::Random().normalized()); Matrix3 m; m = q1; -- cgit v1.2.3 From c006ecace15602e8c4b9a543af108a1dadc080db Mon Sep 17 00:00:00 2001 From: Tal Hadad Date: Sun, 20 Dec 2015 20:07:06 +0200 Subject: Fix comments --- unsupported/Eigen/src/EulerAngles/EulerAngles.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h index 3362d9c3e..d21f52491 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerAngles.h +++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h @@ -84,16 +84,12 @@ namespace Eigen fromRotation(rot, positiveRangeHeading, positiveRangePitch, positiveRangeRoll); } - // TODO: Support assignment from euler to euler - Scalar angle(int i) const { return m_angles.coeff(i); } Scalar& angle(int i) { return m_angles.coeffRef(i); } const Vector3& coeffs() const { return m_angles; } Vector3& coeffs() { return m_angles; } - // TODO: Add set/get functions - Scalar h() const { return m_angles[0]; } Scalar& h() { return m_angles[0]; } @@ -201,7 +197,7 @@ namespace Eigen return fromRotation(mat); } - // TODO: Assign and construct from another EulerAngle (with different system) + // TODO: Assign and construct from another EulerAngles (with different system) /** Set \c *this from a rotation. 
*/ -- cgit v1.2.3 From 166b56bc613e89a3e93f8c08912f15bd507d2351 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Apr 2016 22:45:54 -0700 Subject: Fixed the type casting benchmark for float16 --- bench/tensors/tensor_benchmarks.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 16b388abf..8fe211602 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -48,6 +48,10 @@ template class BenchmarkSuite { Eigen::array sizes; sizes[0] = m_; sizes[1] = k_; + if (sizeof(T) < sizeof(int)) { + sizes[0] = m_ * sizeof(T) / sizeof(int); + sizes[1] = k_ * sizeof(T) / sizeof(int); + } const TensorMap, Eigen::Aligned> A((int*)a_, sizes); TensorMap, Eigen::Aligned> B(b_, sizes); -- cgit v1.2.3 From 0e8fc3108762203867926252c1862a07e16b9514 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 15 Apr 2016 07:08:57 -0400 Subject: remove pgather/pscatter for std::complex for s390x --- Eigen/src/Core/arch/ZVector/Complex.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 9a8735ac1..e9d83eca6 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -57,21 +57,6 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex< template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - pstore >(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; -} - template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } -- cgit v1.2.3 From ee0459300b5222d5a8bf3efb2b734b154c3c4817 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 28 Apr 2016 14:31:21 -0300 Subject: minor fix, add to copyright --- Eigen/src/Core/arch/AltiVec/Complex.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 58c296171..7631fec91 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud +// Copyright (C) 2010-2016 Konstantinos Margaritis // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -15,13 +16,15 @@ namespace Eigen { namespace internal { static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; -#ifdef _BIG_ENDIAN +#ifdef __VSX__ +#if defined(_BIG_ENDIAN) static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; #else static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; #endif +#endif //---------- float ---------- struct Packet2cf -- cgit v1.2.3 From 950158f6d1f35f0d5537436de88899f661092015 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 28 Apr 2016 14:32:11 -0300 Subject: add name to copyrights --- Eigen/src/Core/arch/NEON/Complex.h | 1 + Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index d2d467936..3930dd67d 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud +// Copyright (C) 2010 Konstantinos Margaritis // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 3224c36bd..7231817a3 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -2,7 +2,7 @@ // for linear algebra. // // Copyright (C) 2008-2009 Gael Guennebaud -// Copyright (C) 2010 Konstantinos Margaritis +// Copyright (C) 2010 Konstantinos Margaritis // Heavily based on Gael's SSE version. // // This Source Code Form is subject to the terms of the Mozilla -- cgit v1.2.3 From 8ed26120c81feb7c15544013e30010bda8fc5a80 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 28 Apr 2016 14:32:42 -0300 Subject: bring Altivec/VSX to a better state, implement some of the missing functions --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 228 +++++++++++++++++-------------- 1 file changed, 128 insertions(+), 100 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 0dbbc2e42..62c8df115 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Konstantinos Margaritis +// Copyright (C) 2008-2016 Konstantinos Margaritis // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -42,7 +42,7 @@ typedef __vector unsigned char Packet16uc; // and it doesn't really work to declare them global, so we define macros instead #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X) + Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = vec_splat_s32(X) @@ -69,13 +69,13 @@ typedef __vector unsigned char Packet16uc; // These constants are endian-agnostic static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} -#ifndef __VSX__ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} -static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} -#endif static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} +#ifndef __VSX__ +static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} +#endif static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; @@ -95,8 +95,10 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; // Handle endianness properly while loading constants // Define global static constants: #ifdef _BIG_ENDIAN -static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); +static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); +#ifdef __VSX__ static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +#endif static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; @@ -121,6 +123,12 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #endif // _BIG_ENDIAN +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); +#else + #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#endif + template<> struct packet_traits : default_packet_traits { typedef Packet4f type; @@ -129,15 +137,30 @@ template<> struct packet_traits : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, size=4, - HasHalfPacket=0, + HasHalfPacket = 1, - // FIXME check the Has* + HasAdd = 1, + HasSub = 1, + HasMul = 1, HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, HasSin = 0, HasCos = 0, - HasLog = 1, + HasLog = 0, HasExp = 1, - HasSqrt = 0 +#ifdef __VSX__ + HasSqrt = 1, +#else + HasSqrt = 0, +#endif + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -145,10 +168,16 @@ template<> struct packet_traits : 
default_packet_traits typedef Packet4i type; typedef Packet4i half; enum { - // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size = 4, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 }; }; @@ -200,18 +229,6 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; return s; } -/* -inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) -{ - union { - Packet4bi v; - unsigned int n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; - return s; -}*/ - // Need to define them first or we get specialization after instantiation errors template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } @@ -221,20 +238,17 @@ template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& f template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - float EIGEN_ALIGN16 af[4]; - af[0] = from; - Packet4f vc = pload(af); - vc = vec_splat(vc, 0); - return vc; + float EIGEN_ALIGN16 af; + af = from; + Packet4f vc = vec_lde(0, &af); + return vec_splat(vc, 0); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { - int EIGEN_ALIGN16 ai[4]; - ai[0] = from; - Packet4i vc = pload(ai); - vc = vec_splat(vc, 0); - return vc; + int EIGEN_ALIGN16 ai; + ai = from; + Packet4i vc = vec_lde(0, &ai); + return vec_splat(vc, 0); } template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, @@ -310,42 +324,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); } -/* Commented out: it's actually slower than processing it scalar - * -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) -{ - // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec - //Set up constants, variables - Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; - - // Get the absolute values - a1 = vec_abs(a); - b1 = vec_abs(b); - - // Get the signs using xor - Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO); - - // Do the multiplication for the asbolute values. - bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 ); - low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1); - high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO); - high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16); - prod = vec_add( low_prod, high_prod ); - - // NOR the product and select only the negative elements according to the sign mask - prod_ = vec_nor(prod, prod); - prod_ = vec_sel(p4i_ZERO, prod_, sgn); - - // Add 1 to the result to get the negative numbers - v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn); - prod_ = vec_add(prod_, v1sel); - - // Merge the results back to the final vector. 
- prod = vec_sel(prod, prod_, sgn); - - return prod; -} +/* +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4i& a, const Packet4i& b) { return vec_madd(a,b,p4f_ZERO); } */ + template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { #ifndef __VSX__ // VSX actually provides a div instruction @@ -391,6 +373,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } + #ifdef _BIG_ENDIAN template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { @@ -494,16 +480,19 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& } #endif -#ifndef __VSX__ -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } -#endif +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); } template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } @@ -511,9 +500,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f b, sum; - b = (Packet4f) vec_sld(a, a, 8); + b = vec_sld(a, a, 8); sum = vec_add(a, b); - b = (Packet4f) vec_sld(sum, sum, 4); + b = vec_sld(sum, sum, 4); sum = vec_add(sum, b); return pfirst(sum); } @@ -591,8 +580,8 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { Packet4f prod; - prod = pmul(a, (Packet4f)vec_sld(a, a, 8)); - return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4))); + prod = pmul(a, vec_sld(a, a, 8)); + 
return pfirst(pmul(prod, vec_sld(prod, prod, 4))); } template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) @@ -716,17 +705,31 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); + return vec_sel(elsePacket, thenPacket, mask); +} + +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); + return vec_sel(elsePacket, thenPacket, mask); +} + //---------- double ---------- #ifdef __VSX__ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; +typedef __vector __bool long Packet2bl; -static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO; -static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO; -static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +static Packet2l p2l_ONE = { 1, 1 }; +static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); +static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; #ifdef _BIG_ENDIAN static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8); @@ -753,11 +756,26 @@ template<> struct packet_traits : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, size=2, - HasHalfPacket = 0, + HasHalfPacket = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, HasExp = 1, - HasSqrt = 0 + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 }; }; @@ -784,8 +802,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { double EIGEN_ALIGN16 af[2]; af[0] = from; Packet2d vc = pload(af); - vc = vec_splat_dbl(vc, 0); - return vc; + return vec_splat_dbl(vc, 0); } template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, @@ -840,6 +857,10 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } + template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD @@ -859,12 +880,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); } - -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE void prefetch(const 
double* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); +} template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) @@ -882,7 +905,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) v[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8)); #ifdef _BIG_ENDIAN - sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8); + sum = (Packet2d) vec_sld((Packet4ui) v[0], (Packet4ui) v[1], 8); #else sum = (Packet2d) vec_sld((Packet4ui) v[1], (Packet4ui) v[0], 8); #endif @@ -893,19 +916,19 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) // mul template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { - return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8))); + return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } // min template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { - return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8))); + return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } // max template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { - return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8))); + return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } template @@ -915,9 +938,9 @@ struct palign_impl { if (Offset == 1) #ifdef _BIG_ENDIAN - first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8); + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); #else - first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8); + first = reinterpret_cast(vec_sld(reinterpret_cast(second), reinterpret_cast(first), 8)); #endif } }; @@ -931,6 +954,11 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = t1; } +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2bl mask = vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} #endif // __VSX__ } // end namespace internal -- cgit v1.2.3 From 62f9093b317fca2468c0b851e08d71bf266b996f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 28 Apr 2016 14:33:09 -0300 Subject: improve state of MathFunctions as well --- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 160 ++++++++++++++++------------ 1 file changed, 93 insertions(+), 67 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 9e37e93f8..2f4281771 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -3,6 +3,7 @@ // // Copyright (C) 2007 Julien Pommier // Copyright (C) 2009 Gael Guennebaud +// Copyright (C) 2016 Konstantinos Margaritis // // This Source Code Form is subject to 
the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -19,39 +20,75 @@ namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - _EIGEN_DECLARE_CONST_Packet4i(23, 23); +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - /* the smallest non denormalized float number */ - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); +/* the smallest non denormalized float number */ +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + 
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + +static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + +static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4i emm0; /* isvalid_mask is 0 if x < 0 or x is NaN. */ @@ -112,25 +149,6 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - _EIGEN_DECLARE_CONST_Packet4i(23, 23); - - - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); Packet4f tmp, fx; Packet4i emm0; @@ -171,7 +189,31 @@ Packet4f pexp(const Packet4f& _x) isnumber_mask); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt(const Packet4f& x) +{ + return vec_rsqrt(x); +} + #ifdef __VSX__ +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt(const Packet2d& x) +{ + return vec_rsqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt(const Packet4f& x) +{ + return vec_sqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt(const Packet2d& x) +{ + return vec_sqrt(x); +} + // VSX support varies between different compilers and even different // versions of the same compiler. 
For gcc version >= 4.9.3, we can use
// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
@@ -194,26 +236,10 @@ Packet2d pexp(const Packet2d& _x)
 {
   Packet2d x = _x;
 
-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
+  emm0 = _mm_slli_epi32(emm0, 20);
+  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
+  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
 
   Packet2d tmp, fx;
   Packet2l emm0;
-- cgit v1.2.3

From 6ed7a7281c491376e71177b0f758627d1fc0bc4e Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Thu, 28 Apr 2016 14:35:55 -0300
Subject: remove accidentally pasted code

---
 Eigen/src/Core/arch/AltiVec/MathFunctions.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index 2f4281771..207ac21f6 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -236,11 +236,6 @@ Packet2d pexp(const Packet2d& _x)
 {
   Packet2d x = _x;
 
-  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
-  emm0 = _mm_slli_epi32(emm0, 20);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
-
   Packet2d tmp, fx;
   Packet2l emm0;
-- cgit v1.2.3

From 87294c84a65b5835bc8fa85ae565e35d8fb86d38 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Thu, 28 Apr 2016 14:39:56 -0300
Subject: define Packet2d constants with VSX only

---
 Eigen/src/Core/arch/AltiVec/MathFunctions.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index 207ac21f6..0c137ec7e 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -62,6 +62,7 @@ static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
 static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
 
+#ifdef __VSX__
 static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
 static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
 static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
@@ -82,7 +83,7 @@ static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
 
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
 static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
+#endif
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f plog(const Packet4f& _x)
-- cgit v1.2.3
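The MathFunctions commits above hoist the cephes-style constants to file scope; they encode the classic range-reduction-plus-polynomial scheme for the elementary functions. As a rough orientation, here is a scalar sketch of what the single-precision pexp kernel computes with those constants (an illustrative reconstruction, not part of any patch; the function name is made up):

#include <algorithm>
#include <cmath>

// Scalar model of the vectorized pexp kernel: exp(x) = 2^n * e^r, with
// n = round(x * log2(e)) and e^r approximated by the p0..p5 polynomial.
static float cephes_expf_sketch(float x)
{
  x = std::min(std::max(x, -88.3762626647949f), 88.3762626647950f); // clamp to [exp_lo, exp_hi]
  float fx = std::floor(x * 1.44269504088896341f + 0.5f);           // n = round(x * cephes_LOG2EF)
  x -= fx * 0.693359375f;                                           // r = x - n*ln(2), with ln(2)
  x -= fx * -2.12194440e-4f;                                        // split into C1 + C2 for accuracy
  float y = 1.9875691500E-4f;                                       // Horner evaluation of p0..p5
  y = y * x + 1.3981999507E-3f;
  y = y * x + 8.3334519073E-3f;
  y = y * x + 4.1665795894E-2f;
  y = y * x + 1.6666665459E-1f;
  y = y * x + 5.0000001201E-1f;
  y = y * (x * x) + x + 1.0f;                                       // e^r ~= 1 + r + r^2 * P(r)
  return std::ldexp(y, static_cast<int>(fx));                       // scale by 2^n
}

The AltiVec code performs the same steps on four lanes with vec_* intrinsics, and rebuilds the 2^n factor by shifting n into the float exponent bits rather than calling ldexp.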
From e2ca4784851c117f9039719ff71ba2db9fc922b8 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 3 May 2016 23:15:29 +0200
Subject: bug #1214: consider denormals as zero in D&C SVD. This also works around an infinite binary search when compiling with ICC's unsafe optimizations.

---
 Eigen/src/SVD/BDCSVD.h | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 3552c87bf..bfce1bec0 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -11,7 +11,7 @@
 // Copyright (C) 2013 Jean Ceccato
 // Copyright (C) 2013 Pierre Zoppitelli
 // Copyright (C) 2013 Jitse Niesen
-// Copyright (C) 2014 Gael Guennebaud
+// Copyright (C) 2014-2016 Gael Guennebaud
 //
 // Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,6 +21,7 @@
 #define EIGEN_BDCSVD_H
 // #define EIGEN_BDCSVD_DEBUG_VERBOSE
 // #define EIGEN_BDCSVD_SANITY_CHECKS
+
 namespace Eigen {
 
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
@@ -228,6 +229,8 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign
 #endif
   allocate(matrix.rows(), matrix.cols(), computationOptions);
   using std::abs;
+
+  const RealScalar considerZero = (std::numeric_limits::min)();
 
   //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
   if(matrix.cols() < m_algoswap)
@@ -266,7 +269,7 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign
     {
       RealScalar a = abs(m_computed.coeff(i, i));
       m_singularValues.coeffRef(i) = a * scale;
-      if (a == 0)
+      if (a::divide (Index firstCol, Index lastCol, Index firstRowW,
   using std::abs;
   const Index n = lastCol - firstCol + 1;
   const Index k = n/2;
+  const RealScalar considerZero = (std::numeric_limits::min)();
   RealScalar alphaK;
   RealScalar betaK;
   RealScalar r0;
@@ -434,7 +438,7 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW,
     f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
   }
   if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1;
-  if (r0 == 0)
+  if (r0::divide (Index firstCol, Index lastCol, Index firstRowW,
 template
 void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
 {
+  const RealScalar considerZero = (std::numeric_limits::min)();
+
   using std::abs;
   ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
   m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
   ArrayRef diag = m_workspace.head(n);
@@ -575,7 +581,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
   while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
   Index m = 0; // size of the deflated problem
   for(Index k=0;kconsiderZero) m_workspaceI(m++) = k;
   Map perm(m_workspaceI.data(),m);
@@ -600,7 +606,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
 {
   Index actual_n = n;
-  while(actual_n>1 && col0(actual_n-1)==0) --actual_n;
+  while(actual_n>1 && abs(col0(actual_n-1))0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
@@ -680,6 +686,7 @@ typename BDCSVD::RealScalar BDCSVD::secularEq(RealScalar
     res += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu));
   }
   return res;
+
 }
 
 template
@@ -798,7 +805,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d
     RealScalar leftShifted, rightShifted;
     if (shift == left)
     {
-      leftShifted = RealScalar(1)/NumTraits::highest();
+      leftShifted = (std::numeric_limits::min)();
       // I don't understand why the case k==0 would be
special there: // if (k == 0) rightShifted = right - left; else rightShifted = (k==actual_n-1) ? right : ((right - left) * 0.6); // theoretically we can take 0.5, but let's be safe @@ -806,7 +813,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d else { leftShifted = -(right - left) * 0.6; - rightShifted = -RealScalar(1)/NumTraits::highest(); + rightShifted = -(std::numeric_limits::min)(); } RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); @@ -817,7 +824,10 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(!(fLeft * fRight<0)) + { + std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose() << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n"; std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; + } #endif eigen_internal_assert(fLeft * fRight < 0); @@ -1028,8 +1038,9 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index Diagonal fulldiag(m_computed); VectorBlock,Dynamic> diag(fulldiag, firstCol+shift, length); + const RealScalar considerZero = (std::numeric_limits::min)(); RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); - RealScalar epsilon_strict = NumTraits::epsilon() * maxDiag; + RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits::epsilon() * maxDiag); RealScalar epsilon_coarse = 8 * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); #ifdef EIGEN_BDCSVD_SANITY_CHECKS @@ -1082,7 +1093,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index { // Check for total deflation // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting - bool total_deflation = (col0.tail(length-1).array()==RealScalar(0)).all(); + bool total_deflation = (col0.tail(length-1).array()::deflation(Index firstCol, Index lastCol, Index k, Index // Move deflated diagonal entries at the end. for(Index i=1; i::deflation(Index firstCol, Index lastCol, Index k, Index for(Index i=1; i::deflation(Index firstCol, Index lastCol, Index k, Index //condition 4.4 { Index i = length-1; - while(i>0 && (diag(i)==0 || col0(i)==0)) --i; + while(i>0 && (abs(diag(i))1;--i) if( (diag(i) - diag(i-1)) < NumTraits::epsilon()*maxDiag ) { @@ -1177,7 +1188,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #ifdef EIGEN_BDCSVD_SANITY_CHECKS for(Index j=2;j Date: Tue, 3 May 2016 19:56:40 -0700 Subject: Use numext::isfinite instead of std::isfinite --- unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 0f6dcedaa..4236c75a6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -67,10 +67,9 @@ class TensorOpCost { bytes_stored_(bytes_stored), compute_cycles_(vectorized ? 
compute_cycles / packet_size : compute_cycles) {
-    using std::isfinite;
-    eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded));
-    eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored));
-    eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles));
+    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
-- cgit v1.2.3

From 75a94b966246f3987d3a95ea6d5feaba6ecc2d03 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 4 May 2016 12:53:14 +0200
Subject: Improve documentation of BDCSVD

---
 Eigen/src/SVD/BDCSVD.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index bfce1bec0..8df570647 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -50,6 +50,18 @@ struct traits<BDCSVD<_MatrixType> >
  *
  * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition
  *
+ * This class first reduces the input matrix to bi-diagonal form using class UpperBidiagonalization,
+ * and then performs a divide-and-conquer diagonalization. Small blocks are diagonalized using class JacobiSVD.
+ * You can control the switching size with the setSwitchSize() method, default is 16.
+ * For small matrices (<16), it is thus preferable to directly use JacobiSVD. For larger ones, BDCSVD is highly
+ * recommended and can be several orders of magnitude faster.
+ *
+ * \warning this algorithm is unlikely to provide accurate results when compiled with unsafe math optimizations.
+ * For instance, this concerns Intel's compiler (ICC), which performs such optimizations by default unless
+ * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
+ * significantly degrade the accuracy.
+ *
+ * \sa class JacobiSVD
  */
 template<typename _MatrixType>
 class BDCSVD : public SVDBase<BDCSVD<_MatrixType> >
-- cgit v1.2.3
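To make the documentation added above concrete, here is a minimal usage sketch of the class (illustrative only: the matrix sizes and options are arbitrary, and setSwitchSize() is the tuning knob mentioned in the new paragraph):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  // A problem well above the size where JacobiSVD is preferable.
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(1000, 300);

  Eigen::BDCSVD<Eigen::MatrixXd> svd;
  svd.setSwitchSize(32); // optional: blocks smaller than this use JacobiSVD (default 16)
  svd.compute(A, Eigen::ComputeThinU | Eigen::ComputeThinV);
  std::cout << "largest singular value: " << svd.singularValues()(0) << "\n";

  // Thin U and V also enable least-squares solving: min_x ||A*x - b||.
  Eigen::VectorXd b = Eigen::VectorXd::Random(1000);
  Eigen::VectorXd x = svd.solve(b);
  std::cout << "residual: " << (A * x - b).norm() << "\n";
  return 0;
}

As the \warning paragraph notes, accuracy can degrade badly under ICC's default fast math or GCC/clang's -ffast-math, so compile with precise floating-point semantics when the singular values matter.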
From be78aea6b3d2443d1c614a0da7a56dfefbad4520 Mon Sep 17 00:00:00 2001
From: Ola Røer Thorsen
Date: Wed, 4 May 2016 10:52:08 +0200
Subject: fix double-promotion/float-conversion in Core/SpecialFunctions.h

---
 Eigen/src/Core/SpecialFunctions.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index 3513a5c63..c6a50bb1d 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -284,7 +284,7 @@ struct digamma_impl {
     bool negative = false;
     const Scalar maxnum = NumTraits::infinity();
-    const Scalar m_pi(EIGEN_PI);
+    const Scalar m_pi = Scalar(EIGEN_PI);
 
     const Scalar zero = Scalar(0);
     const Scalar one = Scalar(1);
@@ -441,7 +441,7 @@ struct igamma_helper {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float big() {
     // use epsneg (1.0 - epsneg == 1.0)
-    return 1.0 / (NumTraits::epsilon() / 2);
+    return 1.0f / (NumTraits::epsilon() / 2);
   }
 };
 
@@ -742,7 +742,7 @@ struct igamma_impl {
     const Scalar machep = igamma_helper::machep();
     const Scalar maxlog = numext::log(NumTraits::highest());
 
-    double ans, ax, c, r;
+    Scalar ans, ax, c, r;
 
     /* Compute  x**a * exp(-x) / gamma(a)  */
     ax = a * numext::log(x) - x - lgamma_impl::run(a);
-- cgit v1.2.3

From dd2b45feede628b66b2b9cf07d44fedf666da4fa Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 4 May 2016 16:57:52 -0700
Subject: Removed extraneous 'explicit' keywords

---
 unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index f0b8ac958..7eccdf7de 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -190,13 +190,13 @@ template {
     EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
   }
 #else
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
     eigen_assert(NumDims == 2);
     (*this)[0] = i0;
     (*this)[1] = i1;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
     eigen_assert(NumDims == 3);
     (*this)[0] = i0;
     (*this)[1] = i1;
     (*this)[2] = i2;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
     eigen_assert(NumDims == 4);
     (*this)[0] = i0;
     (*this)[1] = i1;
     (*this)[2] = i2;
     (*this)[3] = i3;
   }
-  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+  EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
     eigen_assert(NumDims == 5);
     (*this)[0] = i0;
     (*this)[1] = i1;
-- cgit v1.2.3

From 62b710072e282ad70bbcb38468367f7f99232d32 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 4 May 2016 21:08:22 -0700
Subject: Reduced the memory footprint of the cxx11_tensor_image_patch test

---
 unsupported/test/cxx11_tensor_image_patch.cpp | 49 +++++++--------------------
 1 file changed,
12 insertions(+), 37 deletions(-) diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp index 5d6a49181..988b01481 100644 --- a/unsupported/test/cxx11_tensor_image_patch.cpp +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -568,13 +568,7 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out.dimension(4), 16); // RowMajor - Tensor l_in_row_major = l_in.swap_layout(); - VERIFY_IS_EQUAL(l_in.dimension(0), l_in_row_major.dimension(3)); - VERIFY_IS_EQUAL(l_in.dimension(1), l_in_row_major.dimension(2)); - VERIFY_IS_EQUAL(l_in.dimension(2), l_in_row_major.dimension(1)); - VERIFY_IS_EQUAL(l_in.dimension(3), l_in_row_major.dimension(0)); - - Tensor l_out_row_major = l_in_row_major.extract_image_patches(11, 11); + Tensor l_out_row_major = l_in.swap_layout().extract_image_patches(11, 11); VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 16); VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 128*128); VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); @@ -589,10 +583,8 @@ static void test_imagenet_patches() for (int r = 0; r < 11; ++r) { for (int d = 0; d < 3; ++d) { float expected = 0.0f; - float expected_row_major = 0.0f; if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { expected = l_in(d, r-5+i, c-5+j, b); - expected_row_major = l_in_row_major(b, c-5+j, r-5+i, d); } // ColMajor if (l_out(d, r, c, patchId, b) != expected) { @@ -601,15 +593,13 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); // RowMajor if (l_out_row_major(b, patchId, c, r, d) != - expected_row_major) { + expected) { std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), - expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected, expected_row_major); + expected); } } } @@ -628,8 +618,7 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out.dimension(4), 32); // RowMajor - l_in_row_major = l_in.swap_layout(); - l_out_row_major = l_in_row_major.extract_image_patches(9, 9); + l_out_row_major = l_in.swap_layout().extract_image_patches(9, 9); VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); @@ -644,10 +633,8 @@ static void test_imagenet_patches() for (int r = 0; r < 9; ++r) { for (int d = 0; d < 16; ++d) { float expected = 0.0f; - float expected_row_major = 0.0f; if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { expected = l_in(d, r-4+i, c-4+j, b); - expected_row_major = l_in_row_major(b, c-4+j, r-4+i, d); } // ColMajor if (l_out(d, r, c, patchId, b) != expected) { @@ -655,12 +642,10 @@ static void test_imagenet_patches() } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) { + if (l_out_row_major(b, patchId, c, r, d) != expected) { std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. 
- VERIFY_IS_EQUAL(expected, expected_row_major); + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); } } } @@ -679,8 +664,7 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out.dimension(4), 32); // RowMajor - l_in_row_major = l_in.swap_layout(); - l_out_row_major = l_in_row_major.extract_image_patches(7, 7); + l_out_row_major = l_in.swap_layout().extract_image_patches(7, 7); VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); @@ -695,10 +679,8 @@ static void test_imagenet_patches() for (int r = 0; r < 7; ++r) { for (int d = 0; d < 32; ++d) { float expected = 0.0f; - float expected_row_major = 0.0f; if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { expected = l_in(d, r-3+i, c-3+j, b); - expected_row_major = l_in_row_major(b, c-3+j, r-3+i, d); } // ColMajor if (l_out(d, r, c, patchId, b) != expected) { @@ -706,12 +688,10 @@ static void test_imagenet_patches() } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) { + if (l_out_row_major(b, patchId, c, r, d) != expected) { std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected, expected_row_major); + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); } } } @@ -730,8 +710,7 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out.dimension(4), 32); // RowMajor - l_in_row_major = l_in.swap_layout(); - l_out_row_major = l_in_row_major.extract_image_patches(3, 3); + l_out_row_major = l_in.swap_layout().extract_image_patches(3, 3); VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); @@ -746,10 +725,8 @@ static void test_imagenet_patches() for (int r = 0; r < 3; ++r) { for (int d = 0; d < 64; ++d) { float expected = 0.0f; - float expected_row_major = 0.0f; if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { expected = l_in(d, r-1+i, c-1+j, b); - expected_row_major = l_in_row_major(b, c-1+j, r-1+i, d); } // ColMajor if (l_out(d, r, c, patchId, b) != expected) { @@ -757,12 +734,10 @@ static void test_imagenet_patches() } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) { + if (l_out_row_major(b, patchId, c, r, d) != expected) { std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. 
- VERIFY_IS_EQUAL(expected, expected_row_major); + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); } } } -- cgit v1.2.3 From dacb469bc93b5b8578afad19d327606659ec3a55 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 5 May 2016 13:35:45 +0200 Subject: Enable and fix -Wdouble-conversion warnings --- CMakeLists.txt | 6 ++++-- Eigen/src/Core/MathFunctions.h | 2 +- Eigen/src/Core/functors/UnaryFunctors.h | 4 ++-- Eigen/src/Geometry/Rotation2D.h | 10 ++++----- Eigen/src/SVD/BDCSVD.h | 12 +++++------ blas/single.cpp | 2 +- test/fastmath.cpp | 2 +- test/geo_hyperplane.cpp | 10 ++++----- test/geo_quaternion.cpp | 6 +++--- test/geo_transformations.cpp | 2 +- test/linearstructure.cpp | 3 ++- test/packetmath.cpp | 2 +- test/qr_colpivoting.cpp | 2 +- test/sparse_basic.cpp | 4 ++-- test/sparse_block.cpp | 2 +- test/sparse_product.cpp | 2 +- test/sparse_vector.cpp | 2 +- test/sparseqr.cpp | 2 +- test/svd_common.h | 4 ++-- test/svd_fill.h | 2 +- test/triangular.cpp | 4 ++-- .../Eigen/src/MatrixFunctions/MatrixExponential.h | 4 ++-- .../Eigen/src/MatrixFunctions/MatrixFunction.h | 3 ++- .../Eigen/src/MatrixFunctions/MatrixLogarithm.h | 5 +++-- .../Eigen/src/MatrixFunctions/MatrixPower.h | 2 +- unsupported/Eigen/src/Splines/Spline.h | 4 ++-- unsupported/test/FFTW.cpp | 2 +- unsupported/test/autodiff.cpp | 3 ++- unsupported/test/cxx11_float16.cpp | 4 ++-- unsupported/test/cxx11_tensor_expr.cpp | 24 +++++++++++----------- unsupported/test/cxx11_tensor_fft.cpp | 8 ++++---- unsupported/test/cxx11_tensor_fixed_size.cpp | 16 +++++++-------- unsupported/test/matrix_function.cpp | 4 ++-- unsupported/test/matrix_power.cpp | 2 +- 34 files changed, 86 insertions(+), 80 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b3753332..b1247f75f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,7 +120,7 @@ endmacro(ei_add_cxx_compiler_flag) if(NOT MSVC) # We assume that other compilers are partly compatible with GNUCC - # clang outputs some warnings for unknwon flags that are not caught by check_cxx_compiler_flag + # clang outputs some warnings for unknown flags that are not caught by check_cxx_compiler_flag # adding -Werror turns such warnings into errors check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR) if(COMPILER_SUPPORT_WERROR) @@ -143,6 +143,8 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-Wshorten-64-to-32") ei_add_cxx_compiler_flag("-Wenum-conversion") ei_add_cxx_compiler_flag("-Wc++11-extensions") + ei_add_cxx_compiler_flag("-Wdouble-promotion") +# ei_add_cxx_compiler_flag("-Wconversion") # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6 # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0")) @@ -158,7 +160,7 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-fno-common") ei_add_cxx_compiler_flag("-fstrict-aliasing") ei_add_cxx_compiler_flag("-wd981") # disable ICC's "operands are evaluated in unspecified order" remark - ei_add_cxx_compiler_flag("-wd2304") # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor + ei_add_cxx_compiler_flag("-wd2304") # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 
5771abf7d..f31046b54 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1192,7 +1192,7 @@ double tanh(const double &x) { return ::tanh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) { - EIGEN_USING_STD_MATH(floor); + EIGEN_USING_STD_MATH(fmod); return fmod(a, b); } diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 5baba1494..488ebf1d2 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -880,9 +880,9 @@ struct scalar_sign_op { { typedef typename NumTraits::Real real_type; real_type aa = numext::abs(a); - if (aa==0) + if (aa==real_type(0)) return Scalar(0); - aa = 1./aa; + aa = real_type(1)/aa; return Scalar(real(a)*aa, imag(a)*aa ); } //TODO diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h index 5ab0d5920..b42a7df70 100644 --- a/Eigen/src/Geometry/Rotation2D.h +++ b/Eigen/src/Geometry/Rotation2D.h @@ -82,15 +82,15 @@ public: /** \returns the rotation angle in [0,2pi] */ inline Scalar smallestPositiveAngle() const { - Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI); - return tmpScalar(EIGEN_PI)) tmp -= Scalar(2)*Scalar(EIGEN_PI); - else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2)*Scalar(EIGEN_PI); + Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI)); + if(tmp>Scalar(EIGEN_PI)) tmp -= Scalar(2*EIGEN_PI); + else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2*EIGEN_PI); return tmp; } diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 8df570647..799e81bd7 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -765,14 +765,14 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar muPrev, muCur; if (shift == left) { - muPrev = (right - left) * 0.1; + muPrev = (right - left) * RealScalar(0.1); if (k == actual_n-1) muCur = right - left; - else muCur = (right - left) * 0.5; + else muCur = (right - left) * RealScalar(0.5); } else { - muPrev = -(right - left) * 0.1; - muCur = -(right - left) * 0.5; + muPrev = -(right - left) * RealScalar(0.1); + muCur = -(right - left) * RealScalar(0.5); } RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift); @@ -820,11 +820,11 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d leftShifted = (std::numeric_limits::min)(); // I don't understand why the case k==0 would be special there: // if (k == 0) rightShifted = right - left; else - rightShifted = (k==actual_n-1) ? right : ((right - left) * 0.6); // theoretically we can take 0.5, but let's be safe + rightShifted = (k==actual_n-1) ? 
right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe } else { - leftShifted = -(right - left) * 0.6; + leftShifted = -(right - left) * RealScalar(0.6); rightShifted = -(std::numeric_limits::min)(); } diff --git a/blas/single.cpp b/blas/single.cpp index 836e3eee2..20ea57d5c 100644 --- a/blas/single.cpp +++ b/blas/single.cpp @@ -19,4 +19,4 @@ #include "level3_impl.h" float BLASFUNC(sdsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy) -{ return *alpha + BLASFUNC(dsdot)(n, x, incx, y, incy); } +{ return double(*alpha) + BLASFUNC(dsdot)(n, x, incx, y, incy); } diff --git a/test/fastmath.cpp b/test/fastmath.cpp index efdd5b313..438e6b2e5 100644 --- a/test/fastmath.cpp +++ b/test/fastmath.cpp @@ -49,7 +49,7 @@ void check_inf_nan(bool dryrun) { VERIFY( !m.allFinite() ); VERIFY( m.hasNaN() ); } - m(4) /= 0.0; + m(4) /= T(0.0); if(dryrun) { std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << " ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n"; diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp index c1cc691c9..e77702bc7 100644 --- a/test/geo_hyperplane.cpp +++ b/test/geo_hyperplane.cpp @@ -97,9 +97,9 @@ template void lines() Vector u = Vector::Random(); Vector v = Vector::Random(); Scalar a = internal::random(); - while (abs(a-1) < 1e-4) a = internal::random(); - while (u.norm() < 1e-4) u = Vector::Random(); - while (v.norm() < 1e-4) v = Vector::Random(); + while (abs(a-1) < Scalar(1e-4)) a = internal::random(); + while (u.norm() < Scalar(1e-4)) u = Vector::Random(); + while (v.norm() < Scalar(1e-4)) v = Vector::Random(); HLine line_u = HLine::Through(center + u, center + a*u); HLine line_v = HLine::Through(center + v, center + a*v); @@ -111,14 +111,14 @@ template void lines() Vector result = line_u.intersection(line_v); // the lines should intersect at the point we called "center" - if(abs(a-1) > 1e-2 && abs(v.normalized().dot(u.normalized()))<0.9) + if(abs(a-1) > Scalar(1e-2) && abs(v.normalized().dot(u.normalized())) void check_slerp(const QuatType& q0, const QuatType& Scalar largeEps = test_precision(); Scalar theta_tot = AA(q1*q0.inverse()).angle(); - if(theta_tot>EIGEN_PI) + if(theta_tot>Scalar(EIGEN_PI)) theta_tot = Scalar(2.*EIGEN_PI)-theta_tot; for(Scalar t=0; t<=Scalar(1.001); t+=Scalar(0.1)) { @@ -115,8 +115,8 @@ template void quaternion(void) // Do not execute the test if the rotation angle is almost zero, or // the rotation axis and v1 are almost parallel. 
if (abs(aa.angle()) > 5*test_precision() - && (aa.axis() - v1.normalized()).norm() < 1.99 - && (aa.axis() + v1.normalized()).norm() < 1.99) + && (aa.axis() - v1.normalized()).norm() < Scalar(1.99) + && (aa.axis() + v1.normalized()).norm() < Scalar(1.99)) { VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1); } diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index 51f90036d..48393a5c6 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -466,7 +466,7 @@ template void transformations() Scalar a2 = R0.slerp(Scalar(k+1)/Scalar(path_steps), R1).angle(); l += std::abs(a2-a1); } - VERIFY(l<=EIGEN_PI*(Scalar(1)+NumTraits::epsilon()*Scalar(path_steps/2))); + VERIFY(l<=Scalar(EIGEN_PI)*(Scalar(1)+NumTraits::epsilon()*Scalar(path_steps/2))); // check basic features { diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp index 292f33969..e7f4b3dc5 100644 --- a/test/linearstructure.cpp +++ b/test/linearstructure.cpp @@ -21,6 +21,7 @@ template void linearStructure(const MatrixType& m) */ typedef typename MatrixType::Index Index; typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; Index rows = m.rows(); Index cols = m.cols(); @@ -32,7 +33,7 @@ template void linearStructure(const MatrixType& m) m3(rows, cols); Scalar s1 = internal::random(); - while (abs(s1)<1e-3) s1 = internal::random(); + while (abs(s1)(); Index r = internal::random(0, rows-1), c = internal::random(0, cols-1); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 37da6c86f..7f5a6512d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -387,7 +387,7 @@ template void packetmath_real() data2[i] = internal::random(0,1) * std::pow(Scalar(10), internal::random(-6,6)); } - if(internal::random(0,1)<0.1) + if(internal::random(0,1)<0.1f) data1[internal::random(0, PacketSize)] = 0; CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index 46c54b74f..ef3a6173b 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -206,7 +206,7 @@ template void qr_kahan_matrix() RealScalar c = std::sqrt(1 - s*s); for (Index i = 0; i < rows; ++i) { m1(i, i) = pow(s, i); - m1.row(i).tail(rows - i - 1) = -pow(s, i) * c * MatrixType::Ones(1, rows - i - 1); + m1.row(i).tail(rows - i - 1) = -RealScalar(pow(s, i)) * c * MatrixType::Ones(1, rows - i - 1); } m1 = (m1 + m1.transpose()).eval(); ColPivHouseholderQR qr(m1); diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index cb8ebaedf..aa3882583 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -232,11 +232,11 @@ template void sparse_basic(const SparseMatrixType& re for (Index i=0; i(0,1); - if (x<0.1) + if (x<0.1f) { // do nothing } - else if (x<0.5) + else if (x<0.5f) { countFalseNonZero++; m2.insert(i,j) = Scalar(0); diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp index 8a6e0687c..582bf34c3 100644 --- a/test/sparse_block.cpp +++ b/test/sparse_block.cpp @@ -150,7 +150,7 @@ template void sparse_block(const SparseMatrixType& re DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); SparseMatrixType m2(rows, cols); initSparse(density, refMat2, m2); - if(internal::random(0,1)>0.5) m2.makeCompressed(); + if(internal::random(0,1)>0.5f) m2.makeCompressed(); Index j0 = internal::random(0,outer-2); Index j1 = internal::random(0,outer-2); Index n0 = 
internal::random(1,outer-(std::max)(j0,j1)); diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp index 7ec5270e8..501aeeaa6 100644 --- a/test/sparse_product.cpp +++ b/test/sparse_product.cpp @@ -245,7 +245,7 @@ template void sparse_product() for (int k=0; k void sparse_vector(int rows, int cols) { double densityMat = (std::max)(8./(rows*cols), 0.01); - double densityVec = (std::max)(8./float(rows), 0.1); + double densityVec = (std::max)(8./(rows), 0.1); typedef Matrix DenseMatrix; typedef Matrix DenseVector; typedef SparseVector SparseVectorType; diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp index 50d1fcdf2..e8605fd21 100644 --- a/test/sparseqr.cpp +++ b/test/sparseqr.cpp @@ -54,7 +54,7 @@ template void test_sparseqr_scalar() b = dA * DenseVector::Random(A.cols()); solver.compute(A); - if(internal::random(0,1)>0.5) + if(internal::random(0,1)>0.5f) solver.factorize(A); // this checks that calling analyzePattern is not needed if the pattern do not change. if (solver.info() != Success) { diff --git a/test/svd_common.h b/test/svd_common.h index d8611b541..3588eefaa 100644 --- a/test/svd_common.h +++ b/test/svd_common.h @@ -141,14 +141,14 @@ void svd_least_square(const MatrixType& m, unsigned int computationOptions) using std::abs; SolutionType y(x); - y.row(k) = (1.+2*NumTraits::epsilon())*x.row(k); + y.row(k) = (RealScalar(1)+2*NumTraits::epsilon())*x.row(k); RealScalar residual_y = (m*y-rhs).norm(); VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y ); if(internal::is_same::value) ++g_test_level; VERIFY( test_isApprox(residual_y,residual) || residual < residual_y ); if(internal::is_same::value) --g_test_level; - y.row(k) = (1.-2*NumTraits::epsilon())*x.row(k); + y.row(k) = (RealScalar(1)-2*NumTraits::epsilon())*x.row(k); residual_y = (m*y-rhs).norm(); VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y ); if(internal::is_same::value) ++g_test_level; diff --git a/test/svd_fill.h b/test/svd_fill.h index 1bbe645ee..500954d47 100644 --- a/test/svd_fill.h +++ b/test/svd_fill.h @@ -54,7 +54,7 @@ void svd_fill_random(MatrixType &m, int Option = 0) } Matrix samples(7); - samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -1./NumTraits::highest(), 1./NumTraits::highest(); + samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -RealScalar(1)/NumTraits::highest(), RealScalar(1)/NumTraits::highest(); if(Option==Symmetric) { diff --git a/test/triangular.cpp b/test/triangular.cpp index 936c2aef3..3e120f406 100644 --- a/test/triangular.cpp +++ b/test/triangular.cpp @@ -65,7 +65,7 @@ template void triangular_square(const MatrixType& m) m1 = MatrixType::Random(rows, cols); for (int i=0; i(); + while (numext::abs2(m1(i,i))(); Transpose trm4(m4); // test back and forward subsitution with a vector as the rhs @@ -78,7 +78,7 @@ template void triangular_square(const MatrixType& m) m3 = m1.template triangularView(); VERIFY(v2.isApprox(m3.conjugate() * (m1.conjugate().template triangularView().solve(v2)), largerEps)); - // test back and forward subsitution with a matrix as the rhs + // test back and forward substitution with a matrix as the rhs m3 = m1.template triangularView(); VERIFY(m2.isApprox(m3.adjoint() * (m1.adjoint().template triangularView().solve(m2)), largerEps)); m3 = m1.template triangularView(); diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index bbb7e5776..af515eb13 100644 --- 
a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -210,9 +210,9 @@ struct matrix_exp_computeUV using std::pow; const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff(); squarings = 0; - if (l1norm < 4.258730016922831e-001) { + if (l1norm < 4.258730016922831e-001f) { matrix_exp_pade3(arg, U, V); - } else if (l1norm < 1.880152677804762e+000) { + } else if (l1norm < 1.880152677804762e+000f) { matrix_exp_pade5(arg, U, V); } else { const float maxnorm = 3.925724783138660f; diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index 8f7a6f3b0..077853cbd 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -132,6 +132,7 @@ template void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list& clusters) { typedef typename EivalsType::Index Index; + typedef typename EivalsType::RealScalar RealScalar; for (Index i=0; i::iterator qi = matrix_function_find_cluster(i, clusters); @@ -145,7 +146,7 @@ void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::listbegin(), qi->end(), j) == qi->end()) { typename std::list::iterator qj = matrix_function_find_cluster(j, clusters); if (qj == clusters.end()) { diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index e43e86e90..8a78fc1f7 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -37,6 +37,7 @@ template void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) { typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; using std::abs; using std::ceil; using std::imag; @@ -54,14 +55,14 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) { result(0,1) = A(0,1) / A(0,0); } - else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) + else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) { result(0,1) = A(0,1) * (logA11 - logA00) / y; } else { // computation in previous branch is inaccurate if A(1,1) \approx A(0,0) - int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI))); + int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI))); result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y; } } diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index f37d31c3f..6167368d8 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -298,7 +298,7 @@ MatrixPowerAtomic::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar logCurr = log(curr); ComplexScalar logPrev = log(prev); - int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI)); + int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber); return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev); } diff --git a/unsupported/Eigen/src/Splines/Spline.h 
b/unsupported/Eigen/src/Splines/Spline.h index d1636f466..ddcddfc9a 100644 --- a/unsupported/Eigen/src/Splines/Spline.h +++ b/unsupported/Eigen/src/Splines/Spline.h @@ -394,7 +394,7 @@ namespace Eigen Matrix ndu(p+1,p+1); - double saved, temp; + Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that? ndu(0,0) = 1.0; @@ -433,7 +433,7 @@ namespace Eigen // Compute the k-th derivative for (DenseIndex k=1; k<=static_cast(n); ++k) { - double d = 0.0; + Scalar d = 0.0; DenseIndex rk,pk,j1,j2; rk = r-k; pk = p-k; diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp index d3718e2d2..1dd6dc97d 100644 --- a/unsupported/test/FFTW.cpp +++ b/unsupported/test/FFTW.cpp @@ -54,7 +54,7 @@ complex promote(long double x) { return complex( x); long double difpower=0; size_t n = (min)( buf1.size(),buf2.size() ); for (size_t k=0;k(x*2 - 1 + pow(1+x,2) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(-0.5*x*x+0)); + // pow(float, int) promotes to pow(double, double) + return x*2 - 1 + static_cast(pow(1+x,2)) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(Scalar(-0.5)*x*x+0); //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2; EIGEN_ASM_COMMENT("myend"); } diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp index 9a813653c..9141c4820 100644 --- a/unsupported/test/cxx11_float16.cpp +++ b/unsupported/test/cxx11_float16.cpp @@ -34,8 +34,8 @@ void test_conversion() float val1 = float(half(__half(0x3c00))); float val2 = float(half(__half(0x3c01))); float val3 = float(half(__half(0x3c02))); - VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02); + VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00); + VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02); // Conversion from int. 
VERIFY_IS_EQUAL(half(-1).x, 0xbc00); diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 8389e9840..4dd355e6e 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -112,13 +112,13 @@ static void test_3d() Tensor mat1(2,3,7); Tensor mat2(2,3,7); - float val = 1.0; + float val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; mat2(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } @@ -142,7 +142,7 @@ static void test_3d() Tensor mat11(2,3,7); mat11 = mat2 / 3.14f; - val = 1.0; + val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { @@ -155,7 +155,7 @@ static void test_3d() VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f); VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f); VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f); - val += 1.0; + val += 1.0f; } } } @@ -167,25 +167,25 @@ static void test_constants() Tensor mat2(2,3,7); Tensor mat3(2,3,7); - float val = 1.0; + float val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } mat2 = mat1.constant(3.14f); mat3 = mat1.cwiseMax(7.3f).exp(); - val = 1.0; + val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat2(i,j,k), 3.14f); VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f))); - val += 1.0; + val += 1.0f; } } } @@ -228,25 +228,25 @@ static void test_functors() Tensor mat2(2,3,7); Tensor mat3(2,3,7); - float val = 1.0; + float val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } mat2 = mat1.inverse().unaryExpr(&asinf); mat3 = mat1.unaryExpr(&tanhf); - val = 1.0; + val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k))); VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k))); - val += 1.0; + val += 1.0f; } } } diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index 89874349f..2f14ebc62 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp @@ -205,15 +205,15 @@ static void test_fft_real_input_energy() { VERIFY_IS_EQUAL(output.dimension(i), input.dimension(i)); } - float energy_original = 0.0; - float energy_after_fft = 0.0; + RealScalar energy_original = 0.0; + RealScalar energy_after_fft = 0.0; for (int i = 0; i < total_size; ++i) { - energy_original += pow(std::abs(input(i)), 2); + energy_original += numext::abs2(input(i)); } for (int i = 0; i < total_size; ++i) { - energy_after_fft += pow(std::abs(output(i)), 2); + energy_after_fft += numext::abs2(output(i)); } if(FFTDirection == FFT_FORWARD) { diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 46d741b05..4c660de65 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -188,13 +188,13 @@ static void test_3d() // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); // VERIFY_IS_EQUAL((mat1.dimension(2)), 7); - float val = 0.0; + float val = 0.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; mat2(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } @@ -210,13 +210,13 @@ static void test_3d() // 
VERIFY_IS_EQUAL((mat3.dimension(2)), 7); - val = 0.0; + val = 0.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val)); VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val)); - val += 1.0; + val += 1.0f; } } } @@ -226,12 +226,12 @@ static void test_3d() static void test_array() { TensorFixedSize > mat1; - float val = 0.0; + float val = 0.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } @@ -239,12 +239,12 @@ static void test_array() TensorFixedSize > mat3; mat3 = mat1.pow(3.5f); - val = 0.0; + val = 0.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f)); - val += 1.0; + val += 1.0f; } } } diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 9a995f941..cd24064ad 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A) MatrixType scaledA; RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff(); - if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI) - scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum; + if (maxImagPartOfSpectrum >= RealScalar(0.9 * EIGEN_PI)) + scaledA = A * RealScalar(0.9 * EIGEN_PI) / maxImagPartOfSpectrum; else scaledA = A; diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index 8e104ed1e..53911370f 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -24,7 +24,7 @@ void test2dRotation(double tol) s = std::sin(angle); B << c, s, -s, c; - C = Apow(std::ldexp(angle,1) / EIGEN_PI); + C = Apow(std::ldexp(angle,1) / T(EIGEN_PI)); std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n'; VERIFY(C.isApprox(B, tol)); } -- cgit v1.2.3 From b300a84989cd13a779ead980c3197c2599b15f14 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 5 May 2016 13:36:28 +0200 Subject: Fixed some signed/unsigned comparison warnings --- unsupported/test/cxx11_runqueue.cpp | 66 ++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index d1770ee1b..d20d87111 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -33,73 +33,73 @@ void test_basic_runqueue() VERIFY_IS_EQUAL(0u, q.Size()); VERIFY_IS_EQUAL(0, q.PopFront()); std::vector stolen; - VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen)); VERIFY_IS_EQUAL(0u, stolen.size()); // Push one front, pop one front. VERIFY_IS_EQUAL(0, q.PushFront(1)); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); VERIFY_IS_EQUAL(1, q.PopFront()); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); // Push front to overflow.
VERIFY_IS_EQUAL(0, q.PushFront(2)); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); VERIFY_IS_EQUAL(0, q.PushFront(3)); - VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(2u, q.Size()); VERIFY_IS_EQUAL(0, q.PushFront(4)); - VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(3u, q.Size()); VERIFY_IS_EQUAL(0, q.PushFront(5)); - VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(4u, q.Size()); VERIFY_IS_EQUAL(6, q.PushFront(6)); - VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(4u, q.Size()); VERIFY_IS_EQUAL(5, q.PopFront()); - VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(3u, q.Size()); VERIFY_IS_EQUAL(4, q.PopFront()); - VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(2u, q.Size()); VERIFY_IS_EQUAL(3, q.PopFront()); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); VERIFY_IS_EQUAL(2, q.PopFront()); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); VERIFY_IS_EQUAL(0, q.PopFront()); // Push one back, pop one back. VERIFY_IS_EQUAL(0, q.PushBack(7)); - VERIFY_IS_EQUAL(1, q.Size()); - VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(1u, q.Size()); + VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1u, stolen.size()); VERIFY_IS_EQUAL(7, stolen[0]); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); stolen.clear(); // Push back to overflow. VERIFY_IS_EQUAL(0, q.PushBack(8)); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); VERIFY_IS_EQUAL(0, q.PushBack(9)); - VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(2u, q.Size()); VERIFY_IS_EQUAL(0, q.PushBack(10)); - VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(3u, q.Size()); VERIFY_IS_EQUAL(0, q.PushBack(11)); - VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(4u, q.Size()); VERIFY_IS_EQUAL(12, q.PushBack(12)); - VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(4u, q.Size()); // Pop back in halves. - VERIFY_IS_EQUAL(2, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(2, stolen.size()); + VERIFY_IS_EQUAL(2u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(2u, stolen.size()); VERIFY_IS_EQUAL(10, stolen[0]); VERIFY_IS_EQUAL(11, stolen[1]); - VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(2u, q.Size()); stolen.clear(); - VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1u, stolen.size()); VERIFY_IS_EQUAL(9, stolen[0]); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); stolen.clear(); - VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1u, stolen.size()); VERIFY_IS_EQUAL(8, stolen[0]); stolen.clear(); - VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(0, stolen.size()); + VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(0u, stolen.size()); // Empty again. VERIFY(q.Empty()); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); } // Empty tests that the queue is not claimed to be empty when is is in fact not. 
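A note on the pattern above: RunQueue::Size(), PopBackHalf(), and std::vector::size() all return unsigned types, so comparing them against plain int literals is what triggered the warnings; the u suffix keeps both sides of each comparison unsigned. A minimal sketch of the idiom, assuming a queue q and vector stolen as in this test:

  VERIFY_IS_EQUAL(1u, q.Size());       // unsigned literal vs. unsigned Size()
  VERIFY_IS_EQUAL(1u, stolen.size());  // likewise for std::vector::size()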
@@ -130,7 +130,7 @@ void test_empty_runqueue() stolen.clear(); break; } - VERIFY_IS_EQUAL(0, stolen.size()); + VERIFY_IS_EQUAL(0u, stolen.size()); } } } -- cgit v1.2.3 From 06d774bf5865cbecfde868b2554c177d95988552 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 08:37:47 -0700 Subject: Updated the contraction code to ensure that full contraction returns a tensor of rank 0 --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 26 ++++++++-------------- unsupported/test/cxx11_tensor_contraction.cpp | 9 ++------ 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 6f113b903..9d0d432ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -37,7 +37,7 @@ struct traits > typedef typename remove_reference::type _RhsNested; // From NumDims below. - static const int NumDimensions = max_n_1::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; static const int Layout = traits::Layout; enum { @@ -65,7 +65,7 @@ struct traits::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; }; } // end namespace internal @@ -140,7 +140,7 @@ struct TensorContractionEvaluatorBase static const int RDims = internal::array_size::Dimensions>::value; static const int ContractDims = internal::array_size::value; - static const int NumDims = max_n_1::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef array contract_t; typedef array::size> left_nocontract_t; @@ -218,11 +218,9 @@ struct TensorContractionEvaluatorBase rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; } - m_i_strides[0] = 1; - m_j_strides[0] = 1; - if(ContractDims) { - m_k_strides[0] = 1; - } + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; m_i_size = 1; m_j_size = 1; @@ -318,11 +316,6 @@ struct TensorContractionEvaluatorBase } } - // Scalar case. We represent the result as a 1d tensor of size 1. - if (LDims + RDims == 2 * ContractDims) { - m_dimensions[0] = 1; - } - // If the layout is RowMajor, we need to reverse the m_dimensions if (static_cast(Layout) == static_cast(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -607,15 +600,14 @@ struct TensorEvaluator::value; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; // Could we use NumDimensions here?
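// To illustrate what this change enables (a sketch, using the DimPair alias
// from the test below): a full contraction can now be assigned directly to a
// rank-0 tensor and read back with operator()():
//   Tensor<float, 1> vec1(6), vec2(6);
//   Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
//   Tensor<float, 0> scalar = vec1.contract(vec2, dims);
//   float value = scalar();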
typedef DSizes Dimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 0e16308a2..73623b2ed 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -87,19 +87,14 @@ static void test_scalar() vec1.setRandom(); vec2.setRandom(); - Tensor scalar(1); - scalar.setZero(); Eigen::array dims = {{DimPair(0, 0)}}; - typedef TensorEvaluator Evaluator; - Evaluator eval(vec1.contract(vec2, dims), DefaultDevice()); - eval.evalTo(scalar.data()); - EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + Tensor scalar = vec1.contract(vec2, dims); float expected = 0.0f; for (int i = 0; i < 6; ++i) { expected += vec1(i) * vec2(i); } - VERIFY_IS_APPROX(scalar(0), expected); + VERIFY_IS_APPROX(scalar(), expected); } template -- cgit v1.2.3 From f363e533aac5aac0d67fd5728b2e5b509c756bc8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 09:05:45 -0700 Subject: Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 6 +-- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 6 +-- unsupported/test/cxx11_tensor_contract_cuda.cu | 62 ++++++++++++++++++++++ unsupported/test/cxx11_tensor_thread_pool.cpp | 39 ++++++++++++++ 5 files changed, 109 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 9d0d432ee..f8ec0614f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -143,8 +143,8 @@ struct TensorContractionEvaluatorBase static const int NumDims = LDims + RDims - 2 * ContractDims; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 6a3ef14ef..886474986 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1240,10 +1240,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 9044454fd..73c48828c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -92,10 +92,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = LDims + RDims - 2 
* ContractDims; typedef DSizes Dimensions; diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index 6d1ef07f9..98ac180ef 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -84,6 +84,65 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) cudaFree((void*)d_t_result); } + +template +void test_scalar(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor t_left(m_size, k_size); + Tensor t_right(k_size, n_size); + Tensor t_result; + Tensor t_result_gpu; + Eigen::array dims(DimPair(0, 0), DimPair(1, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > + gpu_t_left(d_t_left, m_size, k_size); + Eigen::TensorMap > + gpu_t_right(d_t_right, k_size, n_size); + Eigen::TensorMap > + gpu_t_result(d_t_result); + + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + t_result = t_left.contract(t_right, dims); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + if (fabs(t_result() - t_result_gpu()) > 1e-4f && + !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { + std::cout << "mismatch detected: " << t_result() + << " vs " << t_result_gpu() << std::endl; + assert(false); + } + + cudaFree((void*)d_t_left); + cudaFree((void*)d_t_right); + cudaFree((void*)d_t_result); +} + + template void test_cuda_contraction_m() { for (int k = 32; k < 256; k++) { @@ -138,6 +197,9 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_scalar(128, 128, 128)); + CALL_SUBTEST_1(test_scalar(128, 128, 128)); + CALL_SUBTEST_2(test_cuda_contraction_m()); CALL_SUBTEST_3(test_cuda_contraction_m()); diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e46197464..5fd3f0bf1 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -233,6 +233,42 @@ void test_multithread_contraction_agrees_with_singlethread() { } +template +void test_full_contraction() { + int contract_size1 = internal::random(1, 500); + int contract_size2 = internal::random(1, 500); + + Tensor left(contract_size1, + contract_size2); + Tensor right(contract_size1, + contract_size2); + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(0, 0), DimPair(1, 1)}}); + + 
Eigen::ThreadPool tp(internal::random(2, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random(2, 11)); + + Tensor st_result; + st_result = left.contract(right, dims); + + Tensor tp_result; + tp_result.device(thread_pool_device) = left.contract(right, dims); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + // if both of the values are very small, then do nothing (because the test will fail + // due to numerical precision issues when values are small) + if (fabs(st_result() - tp_result()) >= 1e-4) { + VERIFY_IS_APPROX(st_result(), tp_result()); + } +} + template void test_multithreaded_reductions() { const int num_threads = internal::random(3, 11); @@ -324,6 +360,9 @@ void test_cxx11_tensor_thread_pool() CALL_SUBTEST_4(test_contraction_corner_cases()); CALL_SUBTEST_4(test_contraction_corner_cases()); + CALL_SUBTEST_4(test_full_contraction()); + CALL_SUBTEST_4(test_full_contraction()); + CALL_SUBTEST_5(test_multithreaded_reductions()); CALL_SUBTEST_5(test_multithreaded_reductions()); -- cgit v1.2.3 From 7875437ca0a356467085e1397ed9ff67b629cf91 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 09:08:42 -0700 Subject: Avoided unnecessary type promotion --- unsupported/test/cxx11_tensor_thread_pool.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 5fd3f0bf1..423074a38 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -226,7 +226,7 @@ void test_multithread_contraction_agrees_with_singlethread() { for (ptrdiff_t i = 0; i < st_result.size(); i++) { // if both of the values are very small, then do nothing (because the test will fail // due to numerical precision issues when values are small) - if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) { + if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); } } @@ -264,7 +264,7 @@ void test_full_contraction() { VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); // if both of the values are very small, then do nothing (because the test will fail // due to numerical precision issues when values are small) - if (fabs(st_result() - tp_result()) >= 1e-4) { + if (fabs(st_result() - tp_result()) >= 1e-4f) { VERIFY_IS_APPROX(st_result(), tp_result()); } } -- cgit v1.2.3 From a4d6e8fef06c482467de60ba8c0b51f003675132 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 09:25:55 -0700 Subject: Strongly hint but don't force the compiler to unroll some loops in the tensor executor. This results in up to 27% faster code. --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 5c3d4d630..1155354cd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -59,13 +59,14 @@ class TensorExecutor { const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Manually unroll this loop since compilers don't do it. + // Give the compiler a strong hint to unroll the loop.
But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - evaluator.evalPacket(i); - evaluator.evalPacket(i+PacketSize); - evaluator.evalPacket(i+2*PacketSize); - evaluator.evalPacket(i+3*PacketSize); + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } } const Index VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { @@ -104,12 +105,13 @@ struct EvalRange { if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index last_chunk_offset = last - 4 * PacketSize; - // Manually unroll this loop since compilers don't do it. + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. for (; i <= last_chunk_offset; i += 4*PacketSize) { - evaluator.evalPacket(i); - evaluator.evalPacket(i+PacketSize); - evaluator.evalPacket(i+2*PacketSize); - evaluator.evalPacket(i+3*PacketSize); + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } } last_chunk_offset = last - PacketSize; for (; i <= last_chunk_offset; i += PacketSize) { -- cgit v1.2.3 From 2aba40d208ba14ab7576b6c5edfbf4098f01c752 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 09:26:57 -0700 Subject: Avoid unnecessary type promotion --- unsupported/test/cxx11_tensor_device.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index cbe9e6449..b6ca54d93 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -241,7 +241,7 @@ void test_cpu() { const float result = out(i,j,k); const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); - if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { continue; } VERIFY_IS_APPROX(expected, result); @@ -258,7 +258,7 @@ void test_cpu() { in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); - if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { continue; } VERIFY_IS_APPROX(expected, result); -- cgit v1.2.3 From 28d557265803e3b0891309f5e06644bafdacddd6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 10:02:26 -0700 Subject: Fixed some incorrect assertions --- unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h | 2 +- unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h | 4 ++-- unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h index b30e0a90a..995427978 100644 --- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h @@ -304,7 +304,7 @@ LevenbergMarquardt::minimizeInit(FVectorType &x) // m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative if
(!m_useExternalScaling) m_diag.resize(n); - eigen_assert( (!m_useExternalScaling || m_diag.size()==n) || "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'"); + eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'"); m_qtf.resize(n); /* Function Body */ diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h index b8ba6ddcb..8fe3ed86b 100644 --- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h @@ -150,7 +150,7 @@ HybridNonLinearSolver::solveInit(FVectorType &x) fjac.resize(n, n); if (!useExternalScaling) diag.resize(n); - eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'"); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); /* Function Body */ nfev = 0; @@ -390,7 +390,7 @@ HybridNonLinearSolver::solveNumericalDiffInit(FVectorType & fvec.resize(n); if (!useExternalScaling) diag.resize(n); - eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'"); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); /* Function Body */ nfev = 0; diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h index 69106ddc5..fe3b79ca7 100644 --- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h @@ -179,7 +179,7 @@ LevenbergMarquardt::minimizeInit(FVectorType &x) fjac.resize(m, n); if (!useExternalScaling) diag.resize(n); - eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'"); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); qtf.resize(n); /* Function Body */ @@ -215,7 +215,7 @@ LevenbergMarquardt::minimizeOneStep(FVectorType &x) { using std::abs; using std::sqrt; - + eigen_assert(x.size()==n); // check the caller is not cheating us /* calculate the jacobian matrix. 
*/ @@ -398,7 +398,7 @@ LevenbergMarquardt::minimizeOptimumStorageInit(FVectorType fjac.resize(n, n); if (!useExternalScaling) diag.resize(n); - eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'"); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); qtf.resize(n); /* Function Body */ -- cgit v1.2.3 From f81e4131802d8f437ef52956aa760f56f9e39dd7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 14:15:11 -0700 Subject: Added a benchmark to measure the performance of full reductions of 16 bit floats --- bench/tensors/tensor_benchmarks.h | 2 +- bench/tensors/tensor_benchmarks_fp16_gpu.cu | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 62533a608..e0631b401 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -368,7 +368,7 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> B( b_, input_size); Eigen::array output_size; - TensorMap, Eigen::Aligned> C( + TensorMap, Eigen::Aligned> C( c_, output_size); StartBenchmarkTiming(); diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index 14876556e..65784d0d6 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -33,6 +33,7 @@ BM_FuncGPU(algebraicFunc); BM_FuncGPU(transcendentalFunc); BM_FuncGPU(rowReduction); BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); // Contractions -- cgit v1.2.3 From 910e0135066b3f6c928dc11dff2120284e1b6d26 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 15:38:16 -0700 Subject: Relaxed an assertion that was tighter than necessary. --- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 2742dbb95..f7a9f4ed3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,8 +152,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { - // We could also support the case where NumInputDims==1 if needed.
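// To illustrate the relaxation (a sketch): with NumInputDims >= 1, chipping a
// rank-1 tensor becomes legal and yields a rank-0 result:
//   Tensor<float, 1> v(5);
//   Tensor<float, 0> s = v.chip(2, 0);  // element 2 along dimension 0
//   float x = s();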
- EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumInputDims >= 1, YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); -- cgit v1.2.3 From 0451940fa4caeb6d9658b971e452fcb383b45874 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 15:40:01 -0700 Subject: Relaxed the dummy precision for fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 060c2c805..a63b20318 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -378,7 +378,7 @@ template<> struct NumTraits EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { return internal::raw_uint16_to_half(0x0800); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-3f); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-2f); } EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { return internal::raw_uint16_to_half(0x7bff); } -- cgit v1.2.3 From 9a48688d3723b5cbe5d2f7ec32e20064eeee0f77 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 15:58:05 -0700 Subject: Further improved the testing of fp16 --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 118 ++++++++++++----------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 37fe3e9a4..26d6d10e7 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -141,43 +141,43 @@ void test_cuda_trancendental() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res1_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res1_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res2_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res2_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::TensorMap, Eigen::Aligned> gpu_float1( d_float1, num_elem); Eigen::TensorMap, Eigen::Aligned> gpu_float2( d_float2, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_res1_half( + Eigen::TensorMap, Eigen::Aligned> gpu_res1_half( d_res1_half, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_res1_float( + Eigen::TensorMap, Eigen::Aligned> gpu_res1_float( d_res1_float, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_res2_half( + Eigen::TensorMap, Eigen::Aligned> gpu_res2_half( d_res2_half, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_res2_float( + Eigen::TensorMap, Eigen::Aligned> gpu_res2_float( d_res2_float, num_elem); gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); - gpu_res1_float.device(gpu_device) = gpu_float1.exp(); - gpu_res2_float.device(gpu_device) = 
gpu_float2.log(); - gpu_res1_half.device(gpu_device) = gpu_float1.cast().exp().cast(); - gpu_res2_half.device(gpu_device) = gpu_float2.cast().log().cast(); + gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast(); + gpu_res2_float.device(gpu_device) = gpu_float2.log().cast(); + gpu_res1_half.device(gpu_device) = gpu_float1.cast().exp(); + gpu_res2_half.device(gpu_device) = gpu_float2.cast().log(); Tensor input1(num_elem); - Tensor half_prec1(num_elem); - Tensor full_prec1(num_elem); + Tensor half_prec1(num_elem); + Tensor full_prec1(num_elem); Tensor input2(num_elem); - Tensor half_prec2(num_elem); - Tensor full_prec2(num_elem); + Tensor half_prec2(num_elem); + Tensor full_prec2(num_elem); gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { @@ -206,16 +206,16 @@ void test_cuda_contractions() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::TensorMap, Eigen::Aligned> gpu_float1( d_float1, rows, cols); Eigen::TensorMap, Eigen::Aligned> gpu_float2( d_float2, rows, cols); - Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, rows, cols); - Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, rows, cols); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); @@ -223,13 +223,13 @@ void test_cuda_contractions() { typedef Tensor::DimensionPair DimPair; Eigen::array dims(DimPair(1, 0)); - gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims); - gpu_res_half.device(gpu_device) = gpu_float1.cast().contract(gpu_float2.cast(), dims).cast(); + gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast(); + gpu_res_half.device(gpu_device) = gpu_float1.cast().contract(gpu_float2.cast(), dims); - Tensor half_prec(rows, cols); - Tensor full_prec(rows, cols); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + Tensor half_prec(rows, cols); + Tensor full_prec(rows, cols); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half)); + 
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half)); gpu_device.synchronize(); for (int i = 0; i < rows; ++i) { @@ -254,29 +254,42 @@ void test_cuda_reductions() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float)); - float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(size * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(size * sizeof(Eigen::half)); Eigen::TensorMap, Eigen::Aligned> gpu_float1( d_float1, size, size); Eigen::TensorMap, Eigen::Aligned> gpu_float2( d_float2, size, size); - Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, size); - Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, size); gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); Eigen::array redux_dim = {{0}}; - gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim); - gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim).cast(); + gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast(); + gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim); + + Tensor half_prec(size); + Tensor full_prec(size); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(Eigen::half)); + gpu_device.synchronize(); + + for (int i = 0; i < size; ++i) { + std::cout << "Checking redux " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + redux_dim = {{1}}; + gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast(); + gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim); - Tensor half_prec(size); - Tensor full_prec(size); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(Eigen::half)); gpu_device.synchronize(); for (int i = 0; i < size; ++i) { @@ -333,23 +346,14 @@ void test_cuda_forced_evals() { void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice device(&stream); - if (device.majorDeviceVersion() > 5 || - (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { - std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl; - - CALL_SUBTEST_1(test_cuda_conversion()); - CALL_SUBTEST_1(test_cuda_unary()); - CALL_SUBTEST_1(test_cuda_elementwise()); - CALL_SUBTEST_1(test_cuda_trancendental()); - CALL_SUBTEST_2(test_cuda_contractions()); - CALL_SUBTEST_3(test_cuda_reductions()); - CALL_SUBTEST_4(test_cuda_forced_evals()); - } - else { - std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". 
Skipping the test" << std::endl; - } + CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_unary()); + CALL_SUBTEST_1(test_cuda_elementwise()); + CALL_SUBTEST_1(test_cuda_trancendental()); + CALL_SUBTEST_2(test_cuda_contractions()); + CALL_SUBTEST_3(test_cuda_reductions()); + CALL_SUBTEST_4(test_cuda_forced_evals()); + #else std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; #endif -- cgit v1.2.3 From e3d053e14ea625ccd3bb666a535f063cd43d806a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 16:24:15 -0700 Subject: Refined the testing of log and exp on fp16 --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 26d6d10e7..39a28501b 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -159,8 +159,8 @@ void test_cuda_trancendental() { Eigen::TensorMap, Eigen::Aligned> gpu_res2_float( d_res2_float, num_elem); - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_float1.device(gpu_device) = gpu_float1.random() - 0.5f; + gpu_float2.device(gpu_device) = gpu_float2.random() + 0.5f;; gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast(); gpu_res2_float.device(gpu_device) = gpu_float2.log().cast(); gpu_res1_half.device(gpu_device) = gpu_float1.cast().exp(); -- cgit v1.2.3 From 678a17ba796aa757e542e6d633c15574ef6fb586 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 16:36:39 -0700 Subject: Made the testing of contractions on fp16 more robust --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 39a28501b..443373d4e 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -219,7 +219,7 @@ void test_cuda_contractions() { d_res_float, rows, cols); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); - gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f); typedef Tensor::DimensionPair DimPair; Eigen::array dims(DimPair(1, 0)); @@ -234,8 +234,10 @@ void test_cuda_contractions() { for (int i = 0; i < rows; ++i) { for (int j = 0; j < cols; ++j) { - std::cout << "Checking contract " << i << " " << j << std::endl; - VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl; + if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) { + VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + } } } -- cgit v1.2.3 From 69a8a4e1f316d1a86d6fc74a99c794295b0776c0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 16:52:50 -0700 Subject: Added a test to validate full reduction on tensor of half floats --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 44 ++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 443373d4e..302bf8fd4 100644 --- 
a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -305,6 +305,49 @@ void test_cuda_reductions() { gpu_device.deallocate(d_res_float); } + + +void test_cuda_full_reductions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int size = 13; + int num_elem = size*size; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float1( + d_float1, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_float2( + d_float2, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + + gpu_res_float.device(gpu_device) = gpu_float1.sum().cast(); + gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(); + + Tensor half_prec; + Tensor full_prec; + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); + gpu_device.synchronize(); + + VERIFY_IS_APPROX(full_prec(), half_prec()); + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + void test_cuda_forced_evals() { Eigen::CudaStreamDevice stream; @@ -354,6 +397,7 @@ void test_cxx11_tensor_of_float16_cuda() CALL_SUBTEST_1(test_cuda_trancendental()); CALL_SUBTEST_2(test_cuda_contractions()); CALL_SUBTEST_3(test_cuda_reductions()); + CALL_SUBTEST_3(test_cuda_full_reductions()); CALL_SUBTEST_4(test_cuda_forced_evals()); #else -- cgit v1.2.3 From c54ae65c836873c57cac81eaa600348fa28ae98d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 17:18:47 -0700 Subject: Marked a few tensor operations as read only --- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 091007ab7..abdf742c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -254,7 +254,7 @@ struct nested, 1, t template -class TensorConvolutionOp : public TensorBase > +class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index c556fec0f..26b1f65a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -56,7 +56,7 @@ struct nested, 1, typename eval template -class TensorEvalToOp : public TensorBase > +class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 8491c4ca2..ea250d8bc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -252,7 +252,7 @@ struct nested, 1, typename e template -class TensorSelectOp : public TensorBase > +class TensorSelectOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 7ec757519..5d0548b84 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -55,7 +55,7 @@ struct nested, 1, typename eval -class TensorForcedEvalOp : public TensorBase > +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; -- cgit v1.2.3 From a11bd82dc3683f19a531495ad9a6bd751bb2ee57 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 6 May 2016 11:31:56 +0200 Subject: bug #1213: Give names to anonymous enums --- Eigen/src/Core/util/Constants.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 5f71ba3df..5e2cff90d 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -199,7 +199,7 @@ const unsigned int HereditaryBits = RowMajorBit /** \ingroup enums * Enum containing possible values for the \c Mode or \c UpLo parameter of * MatrixBase::selfadjointView() and MatrixBase::triangularView(), and selfadjoint solvers. */ -enum { +enum TriangularOptions { /** View matrix as a lower triangular matrix. */ Lower=0x1, /** View matrix as an upper triangular matrix. */ @@ -224,7 +224,7 @@ enum { /** \ingroup enums * Enum for indicating whether a buffer is aligned or not. */ -enum { +enum AlignmentOptions { Unaligned=0, /**< Data pointer has no specific alignment. */ Aligned8=8, /**< Data pointer is aligned on a 8 bytes boundary. */ Aligned16=16, /**< Data pointer is aligned on a 16 bytes boundary. */ @@ -273,7 +273,7 @@ enum DirectionType { /** \internal \ingroup enums * Enum to specify how to traverse the entries of a matrix. */ -enum { +enum TraversalOptions { /** \internal Default traversal, no vectorization, no index-based access */ DefaultTraversal, /** \internal No vectorization, use index-based access to have only one for loop instead of 2 nested loops */ @@ -295,7 +295,7 @@ enum { /** \internal \ingroup enums * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */ -enum { +enum UnrollingOptions { /** \internal Do not unroll loops. */ NoUnrolling, /** \internal Unroll only the inner loop, but not the outer loop. */ @@ -307,7 +307,7 @@ enum { /** \internal \ingroup enums * Enum to specify whether to use the default (built-in) implementation or the specialization. */ -enum { +enum SpecializedOptions { Specialized, BuiltIn }; @@ -315,7 +315,7 @@ enum { /** \ingroup enums * Enum containing possible values for the \p _Options template parameter of * Matrix, Array and BandMatrix. */ -enum { +enum StorageOptions { /** Storage order is column major (see \ref TopicStorageOrders). */ ColMajor = 0, /** Storage order is row major (see \ref TopicStorageOrders). */ @@ -328,7 +328,7 @@ enum { /** \ingroup enums * Enum for specifying whether to apply or solve on the left or right. 
*/ -enum { +enum SolverOptions { /** Apply transformation on the left. */ OnTheLeft = 1, /** Apply transformation on the right. */ @@ -353,7 +353,7 @@ enum Default_t { Default }; /** \internal \ingroup enums * Used in AmbiVector. */ -enum { +enum AmbiVectorOptions { IsDense = 0, IsSparse }; @@ -479,8 +479,9 @@ namespace Architecture } /** \internal \ingroup enums - * Enum used as template parameter in Product and product evalautors. */ -enum { DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct }; + * Enum used as template parameter in Product and product evaluators. */ +enum ProductType +{ DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct }; /** \internal \ingroup enums * Enum used in experimental parallel implementation. */ @@ -492,7 +493,7 @@ struct Dense {}; /** The type used to identify a general sparse storage. */ struct Sparse {}; -/** The type used to identify a general solver (foctored) storage. */ +/** The type used to identify a general solver (factored) storage. */ struct SolverStorage {}; /** The type used to identify a permutation storage. */ -- cgit v1.2.3 From 1660e749b40b79e17b39983976555881c9b943f4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 May 2016 08:15:12 -0700 Subject: Avoid double promotion --- test/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/main.h b/test/main.h index b0e3b7818..1bfb9e1b0 100644 --- a/test/main.h +++ b/test/main.h @@ -302,7 +302,7 @@ namespace Eigen { template inline typename NumTraits::Real test_precision() { return NumTraits::dummy_precision(); } template<> inline float test_precision() { return 1e-3f; } template<> inline double test_precision() { return 1e-6; } -template<> inline long double test_precision() { return 1e-6; } +template<> inline long double test_precision() { return 1e-6l; } template<> inline float test_precision >() { return test_precision(); } template<> inline double test_precision >() { return test_precision(); } template<> inline long double test_precision >() { return test_precision(); } -- cgit v1.2.3 From 8adf5cc70f232f919302332e2a2fa0971c34f264 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 May 2016 19:16:43 -0700 Subject: Added support for packet processing of fp16 on kepler and maxwell gpus --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 99 ++++++++++++++++++++++++++++--- Eigen/src/Core/arch/CUDA/TypeCasting.h | 2 +- 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 61d532e4d..0cebc1017 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -17,8 +17,8 @@ // we'll use on the host side (SSE, AVX, ...) 
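// The fallback pattern used below: on devices without native half arithmetic
// (arch < 5.3), each half2 operation is emulated by widening both lanes to
// float, computing in float, and narrowing back, e.g. for addition:
//   float a1 = __low2float(a), a2 = __high2float(a);
//   float b1 = __low2float(b), b2 = __high2float(b);
//   return __floats2half2_rn(a1 + b1, a2 + b2);
// (Note: the pmadd fallback computes r1 = a1 * b1 + c2; presumably c1 is
// intended for the low lane.)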
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) -// Most of the following operations require arch >= 5.3 -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +// Most of the following operations require arch >= 3.0 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 namespace Eigen { namespace internal { @@ -67,13 +67,21 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, co } template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { - return __ldg((const half2*)from); + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { +#if __CUDA_ARCH__ >= 320 + return __ldg((const half2*)from); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { - return __halves2half2(__ldg(from+0), __ldg(from+1)); +#if __CUDA_ARCH__ >= 320 + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif } template<> EIGEN_DEVICE_FUNC inline half2 pgather(const half* from, Index stride) { @@ -107,30 +115,83 @@ ptranspose(PacketBlock& kernel) { } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const half& a) { +#if __CUDA_ARCH__ >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); +#else + float f = __half2float(a) + 1.0f; + return __halves2half2(a, __float2half(f)); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 return __hsub2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 - b1; + float r2 = a2 - b2; + return __floats2half2_rn(r1, r2); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if __CUDA_ARCH__ >= 530 return __hneg2(a); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return __floats2half2_rn(-a1, -a2); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif } - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +#if __CUDA_ARCH__ >= 530 return __hfma2(a, b, c); - } +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float c1 = __low2float(c); + float c2 = __high2float(c); + float r1 = a1 * b1 + c2; + float r2 = a2 * b2 + c2; + return __floats2half2_rn(r1, r2); +#endif +} template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { float a1 = __low2float(a); @@ -163,23 +224,47 @@ template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE half2 pmax(const half2& } template<> EIGEN_DEVICE_FUNC inline half predux(const half2& a) { +#if __CUDA_ARCH__ >= 530 return __hadd(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return half(__float2half_rn(a1 + a2)); +#endif } template<> EIGEN_DEVICE_FUNC inline half predux_max(const half2& a) { +#if __CUDA_ARCH__ >= 530 half first = __low2half(a); half second = __high2half(a); return __hgt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return half(__float2half_rn(numext::maxi(a1, a2))); +#endif } template<> EIGEN_DEVICE_FUNC inline half predux_min(const half2& a) { +#if __CUDA_ARCH__ >= 530 half first = __low2half(a); half second = __high2half(a); return __hlt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return half(__float2half_rn(numext::mini(a1, a2))); +#endif } template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) { +#if __CUDA_ARCH__ >= 530 return __hmul(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return half(__float2half_rn(a1 * a2)); +#endif } } // end namespace internal diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index 396b38eaf..3ea218133 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -71,7 +71,7 @@ struct functor_traits > -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 template <> struct type_casting_traits { -- cgit v1.2.3 From 691614bd2c24fee2ea3771196965a279abb41801 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 7 May 2016 13:28:53 -0700 Subject: Worked around a bug in nvcc on tegra x1 --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 302bf8fd4..dceac793e 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -159,8 +159,8 @@ void test_cuda_trancendental() { Eigen::TensorMap, Eigen::Aligned> gpu_res2_float( d_res2_float, num_elem); - gpu_float1.device(gpu_device) = gpu_float1.random() - 0.5f; - gpu_float2.device(gpu_device) = gpu_float2.random() + 0.5f;; + gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast(); gpu_res2_float.device(gpu_device) = gpu_float2.log().cast(); gpu_res1_half.device(gpu_device) = gpu_float1.cast().exp(); -- cgit v1.2.3 From dc7dbc2df71e88615c4f179a2eded7f617fca7a9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 May 2016 10:17:17 -0700 Subject: Optimized the non blocking thread pool: * Use a pseudo-random permutation of queue indices during random stealing. This ensures that all the queues are considered. * Directly pop from a non-empty queue when we are waiting for work, instead of first noticing that there is a non-empty queue and then doing another round of random stealing to re-discover the non-empty queue. * Steal only 1 task from a remote queue instead of half of tasks. 
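To make the first bullet concrete, here is a standalone sketch of the coprime-based walk (helper names are hypothetical; the real code lives in NonBlockingThreadPool.h below). Because the step is coprime to the number of queues, (start + k*step) % num_queues visits every queue exactly once before repeating:

  #include <vector>
  // All step sizes in [1, n] with GCD(step, n) == 1.
  std::vector<unsigned> ComputeCoprimes(unsigned n) {
    std::vector<unsigned> result;
    for (unsigned i = 1; i <= n; i++) {
      unsigned a = i, b = n;
      while (b != 0) { unsigned tmp = a; a = b; b = tmp % b; }  // Euclid's GCD
      if (a == 1) result.push_back(i);
    }
    return result;
  }
  // Walk all n queues from a random start r with a random coprime step:
  //   unsigned victim = r % n;
  //   unsigned inc = coprimes[r % coprimes.size()];
  //   for (unsigned i = 0; i < n; i++) {
  //     /* probe queues_[victim] */
  //     victim += inc;
  //     if (victim >= n) victim -= n;
  //   }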
---
 .../CXX11/src/ThreadPool/NonBlockingThreadPool.h   | 150 ++++++++++++---
 unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h  |   2 +-
 unsupported/test/cxx11_runqueue.cpp                |   8 ++
 3 files changed, 96 insertions(+), 64 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 1c471a19f..f7e73aabe 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -23,14 +23,38 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
       : env_(env),
         threads_(num_threads),
         queues_(num_threads),
+        coprimes_(num_threads),
         waiters_(num_threads),
         blocked_(),
         spinning_(),
         done_(),
         ec_(waiters_) {
-    for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue());
-    for (int i = 0; i < num_threads; i++)
+    // Calculate coprimes of num_threads.
+    // Coprimes are used for a random walk over all threads in Steal
+    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+    // a walk starting at thread index t and calculate num_threads - 1
+    // subsequent indices as (t + coprime) % num_threads, we will cover all
+    // threads without repetitions (effectively getting a pseudo-random
+    // permutation of thread indices).
+    for (unsigned i = 1; i <= num_threads; i++) {
+      unsigned a = i;
+      unsigned b = num_threads;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes_.push_back(i);
+      }
+    }
+    for (int i = 0; i < num_threads; i++) {
+      queues_.push_back(new Queue());
+    }
+    for (int i = 0; i < num_threads; i++) {
       threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
   }

   ~NonBlockingThreadPoolTempl() {
@@ -84,6 +108,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   Environment env_;
   MaxSizeVector<Thread*> threads_;
   MaxSizeVector<Queue*> queues_;
+  MaxSizeVector<unsigned> coprimes_;
   std::vector<EventCount::Waiter> waiters_;
   std::atomic<unsigned> blocked_;
   std::atomic<bool> spinning_;
@@ -97,82 +122,69 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     pt->index = index;
     Queue* q = queues_[index];
     EventCount::Waiter* waiter = &waiters_[index];
-    std::vector<Task> stolen;
     for (;;) {
-      Task t;
-      if (!stolen.empty()) {
-        t = std::move(stolen.back());
-        stolen.pop_back();
-      }
-      if (!t.f) t = q->PopFront();
+      Task t = q->PopFront();
       if (!t.f) {
-        if (Steal(&stolen)) {
-          t = std::move(stolen.back());
-          stolen.pop_back();
-          while (stolen.size()) {
-            Task t1 = q->PushFront(std::move(stolen.back()));
-            stolen.pop_back();
-            if (t1.f) {
-              // There is not much we can do in this case. Just execute the
-              // remaining directly.
-              stolen.push_back(std::move(t1));
-              break;
+        t = Steal();
+        if (!t.f) {
+          // Leave one thread spinning. This reduces latency.
+          // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
+          // Also, the time it takes to attempt to steal work 1000 times depends
+          // on the size of the thread pool. However the speed at which the user
+          // of the thread pool submits tasks is independent of the size of the
+          // pool. Consider a time-based limit instead.
+          if (!spinning_ && !spinning_.exchange(true)) {
+            for (int i = 0; i < 1000 && !t.f; i++) {
+              t = Steal();
+            }
+            spinning_ = false;
+          }
+          if (!t.f) {
+            if (!WaitForWork(waiter, &t)) {
+              return;
             }
           }
         }
       }
       if (t.f) {
         env_.ExecuteTask(t);
-        continue;
-      }
-      // Leave one thread spinning. This reduces latency.
-      if (!spinning_ && !spinning_.exchange(true)) {
-        bool nowork = true;
-        for (int i = 0; i < 1000; i++) {
-          if (!OutOfWork()) {
-            nowork = false;
-            break;
-          }
-        }
-        spinning_ = false;
-        if (!nowork) continue;
       }
-      if (!WaitForWork(waiter)) return;
     }
   }

   // Steal tries to steal work from other worker threads in best-effort manner.
-  bool Steal(std::vector<Task>* stolen) {
-    if (queues_.size() == 1) return false;
+  Task Steal() {
     PerThread* pt = GetPerThread();
-    unsigned lastq = pt->index;
-    for (unsigned i = queues_.size(); i > 0; i--) {
-      unsigned victim = Rand(&pt->rand) % queues_.size();
-      if (victim == lastq && queues_.size() > 2) {
-        i++;
-        continue;
+    unsigned size = queues_.size();
+    unsigned r = Rand(&pt->rand);
+    unsigned inc = coprimes_[r % coprimes_.size()];
+    unsigned victim = r % size;
+    for (unsigned i = 0; i < size; i++) {
+      Task t = queues_[victim]->PopBack();
+      if (t.f) {
+        return t;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
       }
-      // Steal half of elements from a victim queue.
-      // It is typical to steal just one element, but that assumes that work is
-      // recursively subdivided in halves so that the stolen element is exactly
-      // half of work. If work elements are equally-sized, then is makes sense
-      // to steal half of elements at once and then work locally for a while.
-      if (queues_[victim]->PopBackHalf(stolen)) return true;
-      lastq = victim;
     }
-    // Just to make sure that we did not miss anything.
-    for (unsigned i = queues_.size(); i > 0; i--)
-      if (queues_[i - 1]->PopBackHalf(stolen)) return true;
-    return false;
+    return Task();
   }

-  // WaitForWork blocks until new work is available, or if it is time to exit.
-  bool WaitForWork(EventCount::Waiter* waiter) {
-    // We already did best-effort emptiness check in Steal, so prepare blocking.
+  // WaitForWork blocks until new work is available (returns true), or if it is
+  // time to exit (returns false). Can optionally return a task to execute in t
+  // (in such case t.f != nullptr on return).
+  bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+    eigen_assert(!t->f);
+    // We already did best-effort emptiness check in Steal, so prepare for
+    // blocking.
     ec_.Prewait(waiter);
-    // Now do reliable emptiness check.
-    if (!OutOfWork()) {
+    // Now do a reliable emptiness check.
+    int victim = NonEmptyQueueIndex();
+    if (victim != -1) {
       ec_.CancelWait(waiter);
+      *t = queues_[victim]->PopBack();
       return true;
     }
     // Number of blocked threads is used as termination condition.
@@ -186,7 +198,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
       // right after incrementing blocked_ above. Now a free-standing thread
       // submits work and calls destructor (which sets done_). If we don't
       // re-check queues, we will exit leaving the work unexecuted.
-      if (!OutOfWork()) {
+      if (NonEmptyQueueIndex() != -1) {
        // Note: we must not pop from queues before we decrement blocked_,
        // otherwise the following scenario is possible. Consider that instead
        // of checking for emptiness we popped the only element from queues.
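The Prewait/CancelWait pairing in WaitForWork above is the classic two-phase blocking protocol: announce the intent to sleep, re-check for work, and only commit to sleeping if the re-check still finds nothing, so a notification posted between the first check and the sleep cannot be lost. A simplified mutex-based analogue of the same ordering (Eigen's EventCount implements it without a lock; the names here are illustrative only):

#include <condition_variable>
#include <mutex>
#include <queue>

// Blocking analogue of the announce / re-check / sleep protocol.
class IntMailbox {
 public:
  void Post(int w) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      items_.push(w);
    }
    cv_.notify_one();  // plays the role of the EventCount notification
  }
  int WaitForWork() {
    std::unique_lock<std::mutex> lock(mu_);
    // Taking the lock plays the role of Prewait: a Post between the
    // emptiness check and the sleep cannot slip through unobserved.
    // The predicate re-check is the "reliable emptiness check" above.
    cv_.wait(lock, [this] { return !items_.empty(); });
    int w = items_.front();
    items_.pop();
    return w;
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<int> items_;
};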
@@ -205,10 +217,22 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     return true;
   }

-  bool OutOfWork() {
-    for (unsigned i = 0; i < queues_.size(); i++)
-      if (!queues_[i]->Empty()) return false;
-    return true;
+  int NonEmptyQueueIndex() {
+    PerThread* pt = GetPerThread();
+    unsigned size = queues_.size();
+    unsigned r = Rand(&pt->rand);
+    unsigned inc = coprimes_[r % coprimes_.size()];
+    unsigned victim = r % size;
+    for (unsigned i = 0; i < size; i++) {
+      if (!queues_[victim]->Empty()) {
+        return victim;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= size;
+      }
+    }
+    return -1;
   }

   PerThread* GetPerThread() {
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 0544a6e15..48de960ae 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -100,7 +100,7 @@ class RunQueue {
   // PopBack removes and returns the last element in the queue.
   // Can fail spuriously.
   Work PopBack() {
-    if (Empty()) return 0;
+    if (Empty()) return Work();
     std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
     if (!lock) return Work();
     unsigned back = back_.load(std::memory_order_relaxed);
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index d20d87111..2594ff0c5 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -100,6 +100,14 @@ void test_basic_runqueue()
   // Empty again.
   VERIFY(q.Empty());
   VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(1));
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(1, q.PopBack());
+  VERIFY_IS_EQUAL(2, q.PopBack());
+  VERIFY_IS_EQUAL(3, q.PopBack());
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0, q.Size());
 }

 // Empty tests that the queue is not claimed to be empty when it is in fact not.
-- 
cgit v1.2.3


From ba95e43ea25fd0c6066630ac121559c2e0ba7728 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 9 May 2016 10:45:12 -0700
Subject: Added a new parallelFor api to the thread pool device.

---
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h      | 98 ++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index c02891465..4df4cc220 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -172,7 +172,105 @@ struct ThreadPoolDevice {
     pool_->Schedule(func);
   }

+  // parallelFor executes f with [0, size) arguments in parallel and waits for
+  // completion. Block size is chosen between min_block_size and
+  // 2 * min_block_size to achieve the best parallel efficiency.
+  // If min_block_size == -1, parallelFor uses block size of 1.
+  // If hard_align > 0, block size is aligned to hard_align.
+  // If soft_align > hard_align, block size is aligned to soft_align provided
+  // that it does not increase block size too much.
+  void parallelFor(Index size, Index min_block_size, Index hard_align,
+                   Index soft_align,
+                   std::function<void(Index, Index)> f) const {
+    if (size <= 1 || (min_block_size != -1 && size < min_block_size) ||
+        numThreads() == 1) {
+      f(0, size);
+      return;
+    }
+
+    Index block_size = 1;
+    Index block_count = size;
+    if (min_block_size != -1) {
+      // Calculate block size based on (1) estimated cost and (2) parallel
+      // efficiency. We want blocks to be not too small to mitigate
+      // parallelization overheads; not too large to mitigate tail effect and
+      // potential load imbalance, and we also want the number of blocks to be
+      // evenly divisible across threads.
+      min_block_size = numext::maxi(min_block_size, 1);
+      block_size = numext::mini(min_block_size, size);
+      // Upper bound on block size:
+      const Index max_block_size = numext::mini(min_block_size * 2, size);
+      block_size = numext::mini(
+          alignBlockSize(block_size, hard_align, soft_align), size);
+      block_count = divup(size, block_size);
+      // Calculate parallel efficiency as fraction of total CPU time used for
+      // computations:
+      double max_efficiency =
+          static_cast<double>(block_count) /
+          (divup(block_count, numThreads()) * numThreads());
+      // Now try to increase block size up to max_block_size as long as it
+      // doesn't decrease parallel efficiency.
+      for (Index prev_block_count = block_count; prev_block_count > 1;) {
+        // This is the next block size that divides size into a smaller number
+        // of blocks than the current block_size.
+        Index coarser_block_size = divup(size, prev_block_count - 1);
+        coarser_block_size =
+            alignBlockSize(coarser_block_size, hard_align, soft_align);
+        if (coarser_block_size > max_block_size) {
+          break;  // Reached max block size. Stop.
+        }
+        // Recalculate parallel efficiency.
+        const Index coarser_block_count = divup(size, coarser_block_size);
+        eigen_assert(coarser_block_count < prev_block_count);
+        prev_block_count = coarser_block_count;
+        const double coarser_efficiency =
+            static_cast<double>(coarser_block_count) /
+            (divup(coarser_block_count, numThreads()) * numThreads());
+        if (coarser_efficiency + 0.01 >= max_efficiency) {
+          // Taking it.
+          block_size = coarser_block_size;
+          block_count = coarser_block_count;
+          if (max_efficiency < coarser_efficiency) {
+            max_efficiency = coarser_efficiency;
+          }
+        }
+      }
+    }
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to a multiple of block_size, so we are
+    // guaranteed to get block_count leaves that do actual computations.
+    Barrier barrier(block_count);
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
+      if (last - first <= block_size) {
+        // Single block or less, execute directly.
+        f(first, last);
+        barrier.Notify();
+        return;
+      }
+      // Split into halves and submit to the pool.
+      Index mid = first + divup((last - first) / 2, block_size) * block_size;
+      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
+      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+    };
+    handleRange(0, size);
+    barrier.Wait();
+  }
+
+ private:
+  static Index alignBlockSize(Index size, Index hard_align, Index soft_align) {
+    if (soft_align > hard_align && size >= 4 * soft_align) {
+      // Align to soft_align, if it won't increase size by more than 25%.
+      return (size + soft_align - 1) & ~(soft_align - 1);
+    }
+    if (hard_align > 0) {
+      return (size + hard_align - 1) & ~(hard_align - 1);
+    }
+    return size;
+  }
+
   ThreadPoolInterface* pool_;
   size_t num_threads_;
 };
-- 
cgit v1.2.3


From c3859a2b584730ed1cb155a6d9bc422592d8d26b Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 9 May 2016 17:05:53 -0700
Subject: Added the ability to use a scratch buffer in cuda kernels

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h      | 31 +++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 1d2d162dc..2fa6cad34 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -24,6 +24,9 @@ class StreamInterface {
   // Allocate memory on the actual device where the computation will run
   virtual void* allocate(size_t num_bytes) const = 0;
   virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
 };

 static cudaDeviceProp* m_deviceProperties;
@@ -62,12 +65,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
   // caller responsibility to ensure the stream can run on
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device) {
+      : stream_(stream), device_(device), scratch_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -89,6 +92,12 @@ class CudaStreamDevice : public StreamInterface {
     initializeDeviceProp();
   }

+  virtual ~CudaStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
   const cudaStream_t& stream() const { return *stream_; }
   const cudaDeviceProp& deviceProperties() const {
     return m_deviceProperties[device_];
@@ -112,9 +121,17 @@ class CudaStreamDevice : public StreamInterface {
     assert(err == cudaSuccess);
   }

+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(1024);
+    }
+    return scratch_;
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
+  mutable void* scratch_;
 };

 struct GpuDevice {
@@ -143,10 +160,18 @@ struct GpuDevice {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
 #ifndef __CUDA_ARCH__
     stream_->deallocate(buffer);
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
+#ifndef __CUDA_ARCH__
+    return stream_->scratchpad();
 #else
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
+    return NULL;
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-- 
cgit v1.2.3


From 4670d7d5ce2517b2e9201f1cf44ae62ef2284eb5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 9 May 2016 17:09:54 -0700
Subject: Improved the performance of full reductions on GPU:

Before:
BM_fullReduction/10      200000        11751      8.51 MFlops/s
BM_fullReduction/80        5000       523385     12.23 MFlops/s
BM_fullReduction/640         50     36179326     11.32 MFlops/s
BM_fullReduction/4K           1   2173517195     11.50 MFlops/s
After:
BM_fullReduction/10      500000         5987     16.70 MFlops/s
BM_fullReduction/80      200000        10636    601.73 MFlops/s
BM_fullReduction/640      50000        58428   7010.31 MFlops/s
BM_fullReduction/4K        1000      2006106  12461.95 MFlops/s

---
 .../Eigen/CXX11/src/Tensor/TensorReduction.h       |   8 +
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h   | 170 ++++++++++++++++++---
 2 files changed, 159 insertions(+), 19 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 885295f0a..97f4b34b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -322,6 +322,12 @@ struct OuterReducer {
 template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+
+template <typename S, typename R, typename I>
+__global__ void ReductionInitKernelHalfFloat(R, const S, I, half2*);
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+
 template <int NPT, typename S, typename R, typename I>
 __global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
@@ -618,6 +624,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
   template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+  template <typename S, typename R, typename I> friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half2*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
   template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
   template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index fd2587dd5..9186dffe4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,6 +67,30 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 #endif
 }
+
+template
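The kernels this commit optimizes all share one basic shape: each GPU thread reduces a strided slice of the input into a private accumulator, then folds that partial result into a single output cell through atomicReduce. A generic sum-reduction sketch of that pattern (a hypothetical kernel for orientation only, not the patch's actual code, which is templated over the reducer and adds half-precision variants):

#include <cuda_runtime.h>

// `out` must be initialized to the reduction identity (0 for a sum)
// before launch, e.g. with cudaMemset or a small init kernel.
__global__ void FullSumKernelSketch(const float* in, int n, float* out) {
  float accum = 0.0f;
  const int stride = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
    accum += in[i];  // thread-private partial sum over a strided slice
  }
  atomicAdd(out, accum);  // fold the partial into the single result cell
}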