From 1aa89fb85548dc425d54d2cbe7f28915c29db13a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen <rmlarsen@google.com> Date: Fri, 1 Apr 2016 10:27:59 -0700 Subject: Add matrix condition estimator module that implements the Higham/Hager algorithm from http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf used in LAPACK. Add rcond() methods to FullPivLU and PartialPivLU. --- Eigen/Core | 9 +- Eigen/src/Core/ConditionEstimator.h | 279 ++++++++++++++++++++++++++++++++++++ Eigen/src/LU/FullPivLU.h | 13 +- Eigen/src/LU/PartialPivLU.h | 19 ++- test/lu.cpp | 22 ++- 5 files changed, 332 insertions(+), 10 deletions(-) create mode 100644 Eigen/src/Core/ConditionEstimator.h diff --git a/Eigen/Core b/Eigen/Core index 8428c51e4..0a196c814 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -33,13 +33,13 @@ #ifdef EIGEN_EXCEPTIONS #undef EIGEN_EXCEPTIONS #endif - + // All functions callable from CUDA code must be qualified with __device__ #define EIGEN_DEVICE_FUNC __host__ __device__ - + #else #define EIGEN_DEVICE_FUNC - + #endif #if defined(__CUDA_ARCH__) @@ -282,7 +282,7 @@ inline static const char *SimdInstructionSetsInUse(void) { // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to // ensure QNX/QCC support using std::size_t; -// gcc 4.6.0 wants std:: for ptrdiff_t +// gcc 4.6.0 wants std:: for ptrdiff_t using std::ptrdiff_t; /** \defgroup Core_Module Core module @@ -422,6 +422,7 @@ using std::ptrdiff_t; #include "src/Core/products/TriangularSolverVector.h" #include "src/Core/BandMatrix.h" #include "src/Core/CoreIterators.h" +#include "src/Core/ConditionEstimator.h" #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h new file mode 100644 index 000000000..ab6f59319 --- /dev/null +++ b/Eigen/src/Core/ConditionEstimator.h @@ -0,0 +1,279 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CONDITIONESTIMATOR_H +#define EIGEN_CONDITIONESTIMATOR_H + +namespace Eigen { + +namespace internal { +template <typename Decomposition, bool IsComplex> +struct EstimateInverseL1NormImpl {}; +} // namespace internal + +template <typename Decomposition> +class ConditionEstimator { + public: + typedef typename Decomposition::MatrixType MatrixType; + typedef typename internal::traits<MatrixType>::Scalar Scalar; + typedef typename NumTraits<Scalar>::Real RealScalar; + typedef typename internal::plain_col_type<MatrixType>::type Vector; + + /** \class ConditionEstimator + * \ingroup Core_Module + * + * \brief Condition number estimator. + * + * Computing a decomposition of a dense matrix takes O(n^3) operations, while + * this method estimates the condition number quickly and reliably in O(n^2) + * operations. + * + * \returns an estimate of the reciprocal condition number + * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given the matrix and + * its decomposition. Supports the following decompositions: FullPivLU, + * PartialPivLU. + * + * \sa FullPivLU, PartialPivLU.
+ */ + static RealScalar rcond(const MatrixType& matrix, const Decomposition& dec) { + eigen_assert(matrix.rows() == dec.rows()); + eigen_assert(matrix.cols() == dec.cols()); + eigen_assert(matrix.rows() == matrix.cols()); + if (dec.rows() == 0) { + return RealScalar(1); + } + RealScalar matrix_l1_norm = matrix.cwiseAbs().colwise().sum().maxCoeff(); + return rcond(MatrixL1Norm(matrix), dec); + } + + /** \class ConditionEstimator + * \ingroup Core_Module + * + * \brief Condition number estimator. + * + * Computing a decomposition of a dense matrix takes O(n^3) operations, while + * this method estimates the condition number quickly and reliably in O(n^2) + * operations. + * + * \returns an estimate of the reciprocal condition number + * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and + * its decomposition. Supports the following decompositions: FullPivLU, + * PartialPivLU. + * + * \sa FullPivLU, PartialPivLU. + */ + static RealScalar rcond(RealScalar matrix_norm, const Decomposition& dec) { + eigen_assert(dec.rows() == dec.cols()); + if (dec.rows() == 0) { + return 1; + } + if (matrix_norm == 0) { + return 0; + } + const RealScalar inverse_matrix_norm = EstimateInverseL1Norm(dec); + return inverse_matrix_norm == 0 ? 0 + : (1 / inverse_matrix_norm) / matrix_norm; + } + + /* + * Fast algorithm for computing a lower bound estimate on the L1 norm of + * the inverse of the matrix using at most 10 calls to the solve method on its + * decomposition. This is an implementation of Algorithm 4.1 in + * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf + * The most common usage of this algorithm is in estimating the condition + * number ||A||_1 * ||A^{-1}||_1 of a matrix A. While ||A||_1 can be computed + * directly in O(dims^2) operations (see MatrixL1Norm() below), while + * there is no cheap closed-form expression for ||A^{-1}||_1. + * Given a decompostion of A, this algorithm estimates ||A^{-1}|| in O(dims^2) + * operations. This is done by providing operators that use the decomposition + * to solve systems of the form A x = b or A^* z = c by back-substitution, + * each costing O(dims^2) operations. Since at most 10 calls are performed, + * the total cost is O(dims^2), as opposed to O(dims^3) if the inverse matrix + * B^{-1} was formed explicitly. + */ + static RealScalar EstimateInverseL1Norm(const Decomposition& dec) { + eigen_assert(dec.rows() == dec.cols()); + const int n = dec.rows(); + if (n == 0) { + return 0; + } + return internal::EstimateInverseL1NormImpl< + Decomposition, NumTraits::IsComplex>::compute(dec); + } +}; + +namespace internal { +// Partial specialization for real matrices. +template +struct EstimateInverseL1NormImpl { + typedef typename Decomposition::MatrixType MatrixType; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::plain_col_type::type Vector; + + // Shorthand for vector L1 norm in Eigen. + inline static Scalar VectorL1Norm(const Vector& v) { + return v.template lpNorm<1>(); + } + + static inline Scalar compute(const Decomposition& dec) { + const int n = dec.rows(); + const Vector plus = Vector::Ones(n); + Vector v = plus / n; + v = dec.solve(v); + Scalar lower_bound = VectorL1Norm(v); + if (n == 1) { + return lower_bound; + } + // lower_bound is a lower bound on ||inv(A)||_1 = sup_v ||inv(A) v||_1 / + // ||v||_1 and is the objective maximized by the ("super-") gradient ascent + // algorithm. 
+ // Basic idea: We know that the optimum is achieved at one of the simplices + // v = e_i, so in each iteration we follow a super-gradient to move towards + // the optimal one. + Scalar old_lower_bound = lower_bound; + const Vector minus = -Vector::Ones(n); + Vector sign_vector = (v.cwiseAbs().array() == 0).select(plus, minus); + Vector old_sign_vector = sign_vector; + int v_max_abs_index = -1; + int old_v_max_abs_index = v_max_abs_index; + for (int k = 0; k < 4; ++k) { + // argmax |inv(A)^T * sign_vector| + v = dec.transpose().solve(sign_vector); + v.cwiseAbs().maxCoeff(&v_max_abs_index); + if (v_max_abs_index == old_v_max_abs_index) { + // Break if the solution stagnated. + break; + } + // Move to the new simplex e_j, where j = v_max_abs_index. + v.setZero(); + v[v_max_abs_index] = 1; + v = dec.solve(v); // v = inv(A) * e_j. + lower_bound = VectorL1Norm(v); + if (lower_bound <= old_lower_bound) { + // Break if the gradient step did not increase the lower_bound. + break; + } + sign_vector = (v.array() < 0).select(plus, minus); + if (sign_vector == old_sign_vector) { + // Break if the solution stagnated. + break; + } + old_sign_vector = sign_vector; + old_v_max_abs_index = v_max_abs_index; + old_lower_bound = lower_bound; + } + // The following calculates an independent estimate of ||A||_1 by + // multiplying + // A by a vector with entries of slowly increasing magnitude and alternating + // sign: v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. This + // improvement + // to Hager's algorithm above is due to Higham. It was added to make the + // algorithm more robust in certain corner cases where large elements in + // the matrix might otherwise escape detection due to exact cancellation + // (especially when op and op_adjoint correspond to a sequence of + // backsubstitutions and permutations), which could cause Hager's algorithm + // to vastly underestimate ||A||_1. + Scalar alternating_sign = 1; + for (int i = 0; i < n; ++i) { + v[i] = alternating_sign * static_cast(1) + + (static_cast(i) / (static_cast(n - 1))); + alternating_sign = -alternating_sign; + } + v = dec.solve(v); + const Scalar alternate_lower_bound = + (2 * VectorL1Norm(v)) / (3 * static_cast(n)); + return numext::maxi(lower_bound, alternate_lower_bound); + } +}; + +// Partial specialization for complex matrices. +template +struct EstimateInverseL1NormImpl { + typedef typename Decomposition::MatrixType MatrixType; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename internal::plain_col_type::type Vector; + typedef typename internal::plain_col_type::type + RealVector; + + // Shorthand for vector L1 norm in Eigen. + inline static RealScalar VectorL1Norm(const Vector& v) { + return v.template lpNorm<1>(); + } + + static inline RealScalar compute(const Decomposition& dec) { + const int n = dec.rows(); + const Vector ones = Vector::Ones(n); + Vector v = ones / n; + v = dec.solve(v); + RealScalar lower_bound = VectorL1Norm(v); + if (n == 1) { + return lower_bound; + } + // lower_bound is a lower bound on ||inv(A)||_1 = sup_v ||inv(A) v||_1 / + // ||v||_1 and is the objective maximized by the ("super-") gradient ascent + // algorithm. + // Basic idea: We know that the optimum is achieved at one of the simplices + // v = e_i, so in each iteration we follow a super-gradient to move towards + // the optimal one. 
+ RealScalar old_lower_bound = lower_bound; + int v_max_abs_index = -1; + int old_v_max_abs_index = v_max_abs_index; + for (int k = 0; k < 4; ++k) { + // argmax |inv(A)^* * sign_vector| + RealVector abs_v = v.cwiseAbs(); + const Vector psi = + (abs_v.array() == 0).select(v.cwiseQuotient(abs_v), ones); + v = dec.adjoint().solve(psi); + const RealVector z = v.real(); + z.cwiseAbs().maxCoeff(&v_max_abs_index); + if (v_max_abs_index == old_v_max_abs_index) { + // Break if the solution stagnated. + break; + } + // Move to the new simplex e_j, where j = v_max_abs_index. + v.setZero(); + v[v_max_abs_index] = 1; + v = dec.solve(v); // v = inv(A) * e_j. + lower_bound = VectorL1Norm(v); + if (lower_bound <= old_lower_bound) { + // Break if the gradient step did not increase the lower_bound. + break; + } + old_v_max_abs_index = v_max_abs_index; + old_lower_bound = lower_bound; + } + // The following calculates an independent estimate of ||A||_1 by + // multiplying + // A by a vector with entries of slowly increasing magnitude and alternating + // sign: v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. This + // improvement + // to Hager's algorithm above is due to Higham. It was added to make the + // algorithm more robust in certain corner cases where large elements in + // the matrix might otherwise escape detection due to exact cancellation + // (especially when op and op_adjoint correspond to a sequence of + // backsubstitutions and permutations), which could cause Hager's algorithm + // to vastly underestimate ||A||_1. + RealScalar alternating_sign = 1; + for (int i = 0; i < n; ++i) { + v[i] = alternating_sign * static_cast(1) + + (static_cast(i) / (static_cast(n - 1))); + alternating_sign = -alternating_sign; + } + v = dec.solve(v); + const RealScalar alternate_lower_bound = + (2 * VectorL1Norm(v)) / (3 * static_cast(n)); + return numext::maxi(lower_bound, alternate_lower_bound); + } +}; + +} // namespace internal +} // namespace Eigen + +#endif diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 1721213d6..ff0b78c35 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -231,6 +231,15 @@ template class FullPivLU return Solve(*this, b.derived()); } + /** \returns an estimate of the reciprocal condition number of the matrix of which *this is + the LU decomposition. + */ + inline RealScalar rcond() const + { + eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); + return ConditionEstimator >::rcond(m_l1_norm, *this); + } + /** \returns the determinant of the matrix of which * *this is the LU decomposition. 
It has only linear complexity * (that is, O(n) where n is the dimension of the square matrix) @@ -410,6 +419,7 @@ template class FullPivLU IntColVectorType m_rowsTranspositions; IntRowVectorType m_colsTranspositions; Index m_det_pq, m_nonzero_pivots; + RealScalar m_l1_norm; RealScalar m_maxpivot, m_prescribedThreshold; bool m_isInitialized, m_usePrescribedThreshold; }; @@ -455,11 +465,12 @@ FullPivLU& FullPivLU::compute(const EigenBase // the permutations are stored as int indices, so just to be sure: eigen_assert(matrix.rows()<=NumTraits::highest() && matrix.cols()<=NumTraits::highest()); - m_isInitialized = true; m_lu = matrix.derived(); + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); computeInPlace(); + m_isInitialized = true; return *this; } diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index ab7797d2a..5d71a66d0 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -76,7 +76,6 @@ template class PartialPivLU typedef Transpositions TranspositionType; typedef typename MatrixType::PlainObject PlainObject; - /** * \brief Default Constructor. * @@ -152,6 +151,15 @@ template class PartialPivLU return Solve(*this, b.derived()); } + /** \returns an estimate of the reciprocal condition number of the matrix of which *this is + the LU decomposition. + */ + inline RealScalar rcond() const + { + eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); + return ConditionEstimator >::rcond(m_l1_norm, *this); + } + /** \returns the inverse of the matrix of which *this is the LU decomposition. * * \warning The matrix being decomposed here is assumed to be invertible. If you need to check for @@ -178,7 +186,7 @@ template class PartialPivLU * * \sa MatrixBase::determinant() */ - typename internal::traits::Scalar determinant() const; + Scalar determinant() const; MatrixType reconstructedMatrix() const; @@ -247,6 +255,7 @@ template class PartialPivLU PermutationType m_p; TranspositionType m_rowsTranspositions; Index m_det_p; + RealScalar m_l1_norm; bool m_isInitialized; }; @@ -256,6 +265,7 @@ PartialPivLU::PartialPivLU() m_p(), m_rowsTranspositions(), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { } @@ -266,6 +276,7 @@ PartialPivLU::PartialPivLU(Index size) m_p(size), m_rowsTranspositions(size), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { } @@ -277,6 +288,7 @@ PartialPivLU::PartialPivLU(const EigenBase& matrix) m_p(matrix.rows()), m_rowsTranspositions(matrix.rows()), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { compute(matrix.derived()); @@ -467,6 +479,7 @@ PartialPivLU& PartialPivLU::compute(const EigenBase::highest()); m_lu = matrix.derived(); + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = matrix.rows(); @@ -484,7 +497,7 @@ PartialPivLU& PartialPivLU::compute(const EigenBase -typename internal::traits::Scalar PartialPivLU::determinant() const +typename PartialPivLU::Scalar PartialPivLU::determinant() const { eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); return Scalar(m_det_p) * m_lu.diagonal().prod(); diff --git a/test/lu.cpp b/test/lu.cpp index f14435114..31991520d 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -11,6 +11,11 @@ #include using namespace std; +template +typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) { + return m.cwiseAbs().colwise().sum().maxCoeff(); +} + template void lu_non_invertible() { typedef typename 
MatrixType::Index Index; @@ -143,7 +148,13 @@ template void lu_invertible() m3 = MatrixType::Random(size,size); m2 = lu.solve(m3); VERIFY_IS_APPROX(m3, m1*m2); - VERIFY_IS_APPROX(m2, lu.inverse()*m3); + MatrixType m1_inverse = lu.inverse(); + VERIFY_IS_APPROX(m2, m1_inverse*m3); + + // Test condition number estimation. + RealScalar rcond = RealScalar(1) / matrix_l1_norm(m1) / matrix_l1_norm(m1_inverse); + // Verify that the estimate is within a factor of 10 of the truth. + VERIFY(lu.rcond() > rcond / 10 && lu.rcond() < rcond * 10); // test solve with transposed lu.template _solve_impl_transposed(m3, m2); @@ -170,6 +181,7 @@ template void lu_partial_piv() PartialPivLU.h */ typedef typename MatrixType::Index Index; + typedef typename NumTraits::Real RealScalar; Index size = internal::random(1,4); MatrixType m1(size, size), m2(size, size), m3(size, size); @@ -181,7 +193,13 @@ template void lu_partial_piv() m3 = MatrixType::Random(size,size); m2 = plu.solve(m3); VERIFY_IS_APPROX(m3, m1*m2); - VERIFY_IS_APPROX(m2, plu.inverse()*m3); + MatrixType m1_inverse = plu.inverse(); + VERIFY_IS_APPROX(m2, m1_inverse*m3); + + // Test condition number estimation. + RealScalar rcond = RealScalar(1) / matrix_l1_norm(m1) / matrix_l1_norm(m1_inverse); + // Verify that the estimate is within a factor of 10 of the truth. + VERIFY(plu.rcond() > rcond / 10 && plu.rcond() < rcond * 10); // test solve with transposed plu.template _solve_impl_transposed(m3, m2); -- cgit v1.2.3 From 91414e0042779b1b9d312d9255f389e67aa38106 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 1 Apr 2016 11:58:17 -0700 Subject: Fix comments in ConditionEstimator and minor cleanup. --- Eigen/src/Core/ConditionEstimator.h | 119 +++++++++++++++++++----------------- test/lu.cpp | 4 +- 2 files changed, 65 insertions(+), 58 deletions(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index ab6f59319..68e4535aa 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -14,7 +14,7 @@ namespace Eigen { namespace internal { template -struct EstimateInverseL1NormImpl {}; +struct EstimateInverseMatrixL1NormImpl {}; } // namespace internal template @@ -48,7 +48,6 @@ class ConditionEstimator { if (dec.rows() == 0) { return RealScalar(1); } - RealScalar matrix_l1_norm = matrix.cwiseAbs().colwise().sum().maxCoeff(); return rcond(MatrixL1Norm(matrix), dec); } @@ -76,42 +75,50 @@ class ConditionEstimator { if (matrix_norm == 0) { return 0; } - const RealScalar inverse_matrix_norm = EstimateInverseL1Norm(dec); + const RealScalar inverse_matrix_norm = EstimateInverseMatrixL1Norm(dec); return inverse_matrix_norm == 0 ? 0 : (1 / inverse_matrix_norm) / matrix_norm; } - /* - * Fast algorithm for computing a lower bound estimate on the L1 norm of - * the inverse of the matrix using at most 10 calls to the solve method on its - * decomposition. This is an implementation of Algorithm 4.1 in - * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf - * The most common usage of this algorithm is in estimating the condition - * number ||A||_1 * ||A^{-1}||_1 of a matrix A. While ||A||_1 can be computed - * directly in O(dims^2) operations (see MatrixL1Norm() below), while - * there is no cheap closed-form expression for ||A^{-1}||_1. - * Given a decompostion of A, this algorithm estimates ||A^{-1}|| in O(dims^2) - * operations. 
This is done by providing operators that use the decomposition - * to solve systems of the form A x = b or A^* z = c by back-substitution, - * each costing O(dims^2) operations. Since at most 10 calls are performed, - * the total cost is O(dims^2), as opposed to O(dims^3) if the inverse matrix - * B^{-1} was formed explicitly. - */ - static RealScalar EstimateInverseL1Norm(const Decomposition& dec) { + /** + * \returns an estimate of ||inv(matrix)||_1 given a decomposition of + * matrix that implements .solve() and .adjoint().solve() methods. + * + * The method implements Algorithms 4.1 and 5.1 from + * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf + * which also forms the basis for the condition number estimators in + * LAPACK. Since at most 10 calls to the solve method of dec are + * performed, the total cost is O(dims^2), as opposed to O(dims^3) + * needed to compute the inverse matrix explicitly. + * + * The most common usage is in estimating the condition number + * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be + * computed directly in O(n^2) operations. + */ + static RealScalar EstimateInverseMatrixL1Norm(const Decomposition& dec) { eigen_assert(dec.rows() == dec.cols()); - const int n = dec.rows(); - if (n == 0) { + if (dec.rows() == 0) { return 0; } - return internal::EstimateInverseL1NormImpl< + return internal::EstimateInverseMatrixL1NormImpl< Decomposition, NumTraits::IsComplex>::compute(dec); } + + /** + * \returns the induced matrix l1-norm + * ||matrix||_1 = sup ||matrix * v||_1 / ||v||_1, which is equal to + * the greatest absolute column sum. + */ + inline static Scalar MatrixL1Norm(const MatrixType& matrix) { + return matrix.cwiseAbs().colwise().sum().maxCoeff(); + } }; namespace internal { + // Partial specialization for real matrices. template -struct EstimateInverseL1NormImpl { +struct EstimateInverseMatrixL1NormImpl { typedef typename Decomposition::MatrixType MatrixType; typedef typename internal::traits::Scalar Scalar; typedef typename internal::plain_col_type::type Vector; @@ -130,8 +137,9 @@ struct EstimateInverseL1NormImpl { if (n == 1) { return lower_bound; } - // lower_bound is a lower bound on ||inv(A)||_1 = sup_v ||inv(A) v||_1 / - // ||v||_1 and is the objective maximized by the ("super-") gradient ascent + // lower_bound is a lower bound on + // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 + // and is the objective maximized by the ("super-") gradient ascent // algorithm. // Basic idea: We know that the optimum is achieved at one of the simplices // v = e_i, so in each iteration we follow a super-gradient to move towards @@ -143,8 +151,8 @@ struct EstimateInverseL1NormImpl { int v_max_abs_index = -1; int old_v_max_abs_index = v_max_abs_index; for (int k = 0; k < 4; ++k) { - // argmax |inv(A)^T * sign_vector| - v = dec.transpose().solve(sign_vector); + // argmax |inv(matrix)^T * sign_vector| + v = dec.adjoint().solve(sign_vector); v.cwiseAbs().maxCoeff(&v_max_abs_index); if (v_max_abs_index == old_v_max_abs_index) { // Break if the solution stagnated. @@ -153,7 +161,7 @@ struct EstimateInverseL1NormImpl { // Move to the new simplex e_j, where j = v_max_abs_index. v.setZero(); v[v_max_abs_index] = 1; - v = dec.solve(v); // v = inv(A) * e_j. + v = dec.solve(v); // v = inv(matrix) * e_j. lower_bound = VectorL1Norm(v); if (lower_bound <= old_lower_bound) { // Break if the gradient step did not increase the lower_bound. 
@@ -168,17 +176,16 @@ struct EstimateInverseL1NormImpl { old_v_max_abs_index = v_max_abs_index; old_lower_bound = lower_bound; } - // The following calculates an independent estimate of ||A||_1 by - // multiplying - // A by a vector with entries of slowly increasing magnitude and alternating - // sign: v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. This - // improvement - // to Hager's algorithm above is due to Higham. It was added to make the - // algorithm more robust in certain corner cases where large elements in - // the matrix might otherwise escape detection due to exact cancellation - // (especially when op and op_adjoint correspond to a sequence of - // backsubstitutions and permutations), which could cause Hager's algorithm - // to vastly underestimate ||A||_1. + // The following calculates an independent estimate of ||matrix||_1 by + // multiplying matrix by a vector with entries of slowly increasing + // magnitude and alternating sign: + // v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. + // This improvement to Hager's algorithm above is due to Higham. It was + // added to make the algorithm more robust in certain corner cases where + // large elements in the matrix might otherwise escape detection due to + // exact cancellation (especially when op and op_adjoint correspond to a + // sequence of backsubstitutions and permutations), which could cause + // Hager's algorithm to vastly underestimate ||matrix||_1. Scalar alternating_sign = 1; for (int i = 0; i < n; ++i) { v[i] = alternating_sign * static_cast(1) + @@ -194,7 +201,7 @@ struct EstimateInverseL1NormImpl { // Partial specialization for complex matrices. template -struct EstimateInverseL1NormImpl { +struct EstimateInverseMatrixL1NormImpl { typedef typename Decomposition::MatrixType MatrixType; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; @@ -216,8 +223,9 @@ struct EstimateInverseL1NormImpl { if (n == 1) { return lower_bound; } - // lower_bound is a lower bound on ||inv(A)||_1 = sup_v ||inv(A) v||_1 / - // ||v||_1 and is the objective maximized by the ("super-") gradient ascent + // lower_bound is a lower bound on + // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 + // and is the objective maximized by the ("super-") gradient ascent // algorithm. // Basic idea: We know that the optimum is achieved at one of the simplices // v = e_i, so in each iteration we follow a super-gradient to move towards @@ -226,7 +234,7 @@ struct EstimateInverseL1NormImpl { int v_max_abs_index = -1; int old_v_max_abs_index = v_max_abs_index; for (int k = 0; k < 4; ++k) { - // argmax |inv(A)^* * sign_vector| + // argmax |inv(matrix)^* * sign_vector| RealVector abs_v = v.cwiseAbs(); const Vector psi = (abs_v.array() == 0).select(v.cwiseQuotient(abs_v), ones); @@ -240,7 +248,7 @@ struct EstimateInverseL1NormImpl { // Move to the new simplex e_j, where j = v_max_abs_index. v.setZero(); v[v_max_abs_index] = 1; - v = dec.solve(v); // v = inv(A) * e_j. + v = dec.solve(v); // v = inv(matrix) * e_j. lower_bound = VectorL1Norm(v); if (lower_bound <= old_lower_bound) { // Break if the gradient step did not increase the lower_bound. @@ -249,17 +257,16 @@ struct EstimateInverseL1NormImpl { old_v_max_abs_index = v_max_abs_index; old_lower_bound = lower_bound; } - // The following calculates an independent estimate of ||A||_1 by - // multiplying - // A by a vector with entries of slowly increasing magnitude and alternating - // sign: v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. 
This - // improvement - // to Hager's algorithm above is due to Higham. It was added to make the - // algorithm more robust in certain corner cases where large elements in - // the matrix might otherwise escape detection due to exact cancellation - // (especially when op and op_adjoint correspond to a sequence of - // backsubstitutions and permutations), which could cause Hager's algorithm - // to vastly underestimate ||A||_1. + // The following calculates an independent estimate of ||matrix||_1 by + // multiplying matrix by a vector with entries of slowly increasing + // magnitude and alternating sign: + // v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. + // This improvement to Hager's algorithm above is due to Higham. It was + // added to make the algorithm more robust in certain corner cases where + // large elements in the matrix might otherwise escape detection due to + // exact cancellation (especially when op and op_adjoint correspond to a + // sequence of backsubstitutions and permutations), which could cause + // Hager's algorithm to vastly underestimate ||matrix||_1. RealScalar alternating_sign = 1; for (int i = 0; i < n; ++i) { v[i] = alternating_sign * static_cast(1) + diff --git a/test/lu.cpp b/test/lu.cpp index 31991520d..9e8059f58 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -152,7 +152,7 @@ template void lu_invertible() VERIFY_IS_APPROX(m2, m1_inverse*m3); // Test condition number estimation. - RealScalar rcond = RealScalar(1) / matrix_l1_norm(m1) / matrix_l1_norm(m1_inverse); + RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); // Verify that the estimate is within a factor of 10 of the truth. VERIFY(lu.rcond() > rcond / 10 && lu.rcond() < rcond * 10); @@ -197,7 +197,7 @@ template void lu_partial_piv() VERIFY_IS_APPROX(m2, m1_inverse*m3); // Test condition number estimation. - RealScalar rcond = RealScalar(1) / matrix_l1_norm(m1) / matrix_l1_norm(m1_inverse); + RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); // Verify that the estimate is within a factor of 10 of the truth. VERIFY(plu.rcond() > rcond / 10 && plu.rcond() < rcond * 10); -- cgit v1.2.3 From fb8dccc23e5f717319c230c2701a5fbf1d3c3975 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 1 Apr 2016 12:48:18 -0700 Subject: Replace "inline static" with "static inline" for consistency. --- Eigen/src/Core/ConditionEstimator.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index 68e4535aa..b65306d56 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -109,7 +109,7 @@ class ConditionEstimator { * ||matrix||_1 = sup ||matrix * v||_1 / ||v||_1, which is equal to * the greatest absolute column sum. */ - inline static Scalar MatrixL1Norm(const MatrixType& matrix) { + static inline Scalar MatrixL1Norm(const MatrixType& matrix) { return matrix.cwiseAbs().colwise().sum().maxCoeff(); } }; @@ -124,7 +124,7 @@ struct EstimateInverseMatrixL1NormImpl { typedef typename internal::plain_col_type::type Vector; // Shorthand for vector L1 norm in Eigen. - inline static Scalar VectorL1Norm(const Vector& v) { + static inline Scalar VectorL1Norm(const Vector& v) { return v.template lpNorm<1>(); } @@ -210,7 +210,7 @@ struct EstimateInverseMatrixL1NormImpl { RealVector; // Shorthand for vector L1 norm in Eigen. 
- inline static RealScalar VectorL1Norm(const Vector& v) { + static inline RealScalar VectorL1Norm(const Vector& v) { return v.template lpNorm<1>(); } -- cgit v1.2.3 From f54137606eb6d68cbafd10d90013e254b26137ed Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 1 Apr 2016 16:19:45 -0700 Subject: Add condition estimation to Cholesky (LLT) factorization. --- Eigen/src/Cholesky/LLT.h | 53 +++++++++++++++++++++++++++++-------- Eigen/src/Core/ConditionEstimator.h | 37 +++++++++++++++++++------- test/cholesky.cpp | 47 +++++++++++++++++++++++--------- test/lu.cpp | 10 ++++--- 4 files changed, 111 insertions(+), 36 deletions(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 74cf5bfe1..b55c5bebf 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -10,7 +10,7 @@ #ifndef EIGEN_LLT_H #define EIGEN_LLT_H -namespace Eigen { +namespace Eigen { namespace internal{ template struct LLT_Traits; @@ -40,7 +40,7 @@ template struct LLT_Traits; * * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out - * + * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) @@ -135,6 +135,16 @@ template class LLT template LLT& compute(const EigenBase& matrix); + /** \returns an estimate of the reciprocal condition number of the matrix of + * which *this is the Cholesky decomposition. + */ + RealScalar rcond() const + { + eigen_assert(m_isInitialized && "LLT is not initialized."); + eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative"); + return ConditionEstimator, true >::rcond(m_l1_norm, *this); + } + /** \returns the LLT decomposition matrix * * TODO: document the storage layout @@ -164,7 +174,7 @@ template class LLT template LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); - + #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -172,17 +182,18 @@ template class LLT #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + /** \internal * Used to compute and store L * The strict upper part is not used and even not initialized. */ MatrixType m_matrix; + RealScalar m_l1_norm; bool m_isInitialized; ComputationInfo m_info; }; @@ -268,7 +279,7 @@ template struct llt_inplace static Index unblocked(MatrixType& mat) { using std::sqrt; - + eigen_assert(mat.rows()==mat.cols()); const Index size = mat.rows(); for(Index k = 0; k < size; ++k) @@ -328,7 +339,7 @@ template struct llt_inplace return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } }; - + template struct llt_inplace { typedef typename NumTraits::Real RealScalar; @@ -387,12 +398,32 @@ template LLT& LLT::compute(const EigenBase& a) { check_template_parameters(); - + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); m_matrix = a.derived(); + // Compute matrix L1 norm = max abs column sum. 
+ m_l1_norm = RealScalar(0); + if (_UpLo == Lower) { + for (int col = 0; col < size; ++col) { + const RealScalar abs_col_sum = m_matrix.col(col).tail(size - col).cwiseAbs().sum() + + m_matrix.row(col).tail(col).cwiseAbs().sum(); + if (abs_col_sum > m_l1_norm) { + m_l1_norm = abs_col_sum; + } + } + } else { + for (int col = 0; col < a.cols(); ++col) { + const RealScalar abs_col_sum = m_matrix.col(col).tail(col).cwiseAbs().sum() + + m_matrix.row(col).tail(size - col).cwiseAbs().sum(); + if (abs_col_sum > m_l1_norm) { + m_l1_norm = abs_col_sum; + } + } + } + m_isInitialized = true; bool ok = Traits::inplace_decomposition(m_matrix); m_info = ok ? Success : NumericalIssue; @@ -419,7 +450,7 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c return *this; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template template @@ -431,7 +462,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const #endif /** \internal use x = llt_object.solve(x); - * + * * This is the \em in-place version of solve(). * * \param bAndX represents both the right-hand side matrix b and result x. @@ -483,7 +514,7 @@ SelfAdjointView::llt() const return LLT(m_matrix); } #endif // __CUDACC__ - + } // end namespace Eigen #endif // EIGEN_LLT_H diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index b65306d56..dca3da417 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -13,11 +13,11 @@ namespace Eigen { namespace internal { -template +template struct EstimateInverseMatrixL1NormImpl {}; } // namespace internal -template +template class ConditionEstimator { public: typedef typename Decomposition::MatrixType MatrixType; @@ -101,7 +101,8 @@ class ConditionEstimator { return 0; } return internal::EstimateInverseMatrixL1NormImpl< - Decomposition, NumTraits::IsComplex>::compute(dec); + Decomposition, IsSelfAdjoint, + NumTraits::IsComplex != 0>::compute(dec); } /** @@ -116,9 +117,27 @@ class ConditionEstimator { namespace internal { +template +struct solve_helper { + static inline Vector solve_adjoint(const Decomposition& dec, + const Vector& v) { + return dec.adjoint().solve(v); + } +}; + +// Partial specialization for self_adjoint matrices. +template +struct solve_helper { + static inline Vector solve_adjoint(const Decomposition& dec, + const Vector& v) { + return dec.solve(v); + } +}; + + // Partial specialization for real matrices. -template -struct EstimateInverseMatrixL1NormImpl { +template +struct EstimateInverseMatrixL1NormImpl { typedef typename Decomposition::MatrixType MatrixType; typedef typename internal::traits::Scalar Scalar; typedef typename internal::plain_col_type::type Vector; @@ -152,7 +171,7 @@ struct EstimateInverseMatrixL1NormImpl { int old_v_max_abs_index = v_max_abs_index; for (int k = 0; k < 4; ++k) { // argmax |inv(matrix)^T * sign_vector| - v = dec.adjoint().solve(sign_vector); + v = solve_helper::solve_adjoint(dec, sign_vector); v.cwiseAbs().maxCoeff(&v_max_abs_index); if (v_max_abs_index == old_v_max_abs_index) { // Break if the solution stagnated. @@ -200,8 +219,8 @@ struct EstimateInverseMatrixL1NormImpl { }; // Partial specialization for complex matrices. 
-template -struct EstimateInverseMatrixL1NormImpl { +template +struct EstimateInverseMatrixL1NormImpl { typedef typename Decomposition::MatrixType MatrixType; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; @@ -238,7 +257,7 @@ struct EstimateInverseMatrixL1NormImpl { RealVector abs_v = v.cwiseAbs(); const Vector psi = (abs_v.array() == 0).select(v.cwiseQuotient(abs_v), ones); - v = dec.adjoint().solve(psi); + v = solve_helper::solve_adjoint(dec, psi); const RealVector z = v.real(); z.cwiseAbs().maxCoeff(&v_max_abs_index); if (v_max_abs_index == old_v_max_abs_index) { diff --git a/test/cholesky.cpp b/test/cholesky.cpp index d652af5bf..8a21cdbd5 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -17,6 +17,12 @@ #include #include +template +typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) { + MatrixType symm = m.template selfadjointView(); + return symm.cwiseAbs().colwise().sum().maxCoeff(); +} + template class CholType> void test_chol_update(const MatrixType& symm) { typedef typename MatrixType::Scalar Scalar; @@ -77,7 +83,7 @@ template void cholesky(const MatrixType& m) { SquareMatrixType symmUp = symm.template triangularView(); SquareMatrixType symmLo = symm.template triangularView(); - + LLT chollo(symmLo); VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix()); vecX = chollo.solve(vecB); @@ -85,6 +91,14 @@ template void cholesky(const MatrixType& m) matX = chollo.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. + const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols)); + RealScalar rcond = (RealScalar(1) / matrix_l1_norm(symmLo)) / + matrix_l1_norm(symmLo_inverse); + RealScalar rcond_est = chollo.rcond(); + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + // test the upper mode LLT cholup(symmUp); VERIFY_IS_APPROX(symm, cholup.reconstructedMatrix()); @@ -93,6 +107,15 @@ template void cholesky(const MatrixType& m) matX = cholup.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. 
+ const MatrixType symmUp_inverse = cholup.solve(MatrixType::Identity(rows,cols)); + rcond = (RealScalar(1) / matrix_l1_norm(symmUp)) / + matrix_l1_norm(symmUp_inverse); + rcond_est = cholup.rcond(); + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + + MatrixType neg = -symmLo; chollo.compute(neg); VERIFY(chollo.info()==NumericalIssue); @@ -101,7 +124,7 @@ template void cholesky(const MatrixType& m) VERIFY_IS_APPROX(MatrixType(chollo.matrixU().transpose().conjugate()), MatrixType(chollo.matrixL())); VERIFY_IS_APPROX(MatrixType(cholup.matrixL().transpose().conjugate()), MatrixType(cholup.matrixU())); VERIFY_IS_APPROX(MatrixType(cholup.matrixU().transpose().conjugate()), MatrixType(cholup.matrixL())); - + // test some special use cases of SelfCwiseBinaryOp: MatrixType m1 = MatrixType::Random(rows,cols), m2(rows,cols); m2 = m1; @@ -167,7 +190,7 @@ template void cholesky(const MatrixType& m) // restore if(sign == -1) symm = -symm; - + // check matrices coming from linear constraints with Lagrange multipliers if(rows>=3) { @@ -183,7 +206,7 @@ template void cholesky(const MatrixType& m) vecX = ldltlo.solve(vecB); VERIFY_IS_APPROX(A * vecX, vecB); } - + // check non-full rank matrices if(rows>=3) { @@ -199,7 +222,7 @@ template void cholesky(const MatrixType& m) vecX = ldltlo.solve(vecB); VERIFY_IS_APPROX(A * vecX, vecB); } - + // check matrices with a wide spectrum if(rows>=3) { @@ -225,7 +248,7 @@ template void cholesky(const MatrixType& m) { RealScalar large_tol = std::sqrt(test_precision()); VERIFY((A * vecX).isApprox(vecB, large_tol)); - + ++g_test_level; VERIFY_IS_APPROX(A * vecX,vecB); --g_test_level; @@ -314,14 +337,14 @@ template void cholesky_bug241(const MatrixType& m) } // LDLT is not guaranteed to work for indefinite matrices, but happens to work fine if matrix is diagonal. -// This test checks that LDLT reports correctly that matrix is indefinite. +// This test checks that LDLT reports correctly that matrix is indefinite. // See http://forum.kde.org/viewtopic.php?f=74&t=106942 and bug 736 template void cholesky_definiteness(const MatrixType& m) { eigen_assert(m.rows() == 2 && m.cols() == 2); MatrixType mat; LDLT ldlt(2); - + { mat << 1, 0, 0, -1; ldlt.compute(mat); @@ -384,11 +407,11 @@ void test_cholesky() CALL_SUBTEST_3( cholesky_definiteness(Matrix2d()) ); CALL_SUBTEST_4( cholesky(Matrix3f()) ); CALL_SUBTEST_5( cholesky(Matrix4d()) ); - - s = internal::random(1,EIGEN_TEST_MAX_SIZE); + + s = internal::random(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_2( cholesky(MatrixXd(s,s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) - + s = internal::random(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) @@ -402,6 +425,6 @@ void test_cholesky() // Test problem size constructors CALL_SUBTEST_9( LLT(10) ); CALL_SUBTEST_9( LDLT(10) ); - + TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries) } diff --git a/test/lu.cpp b/test/lu.cpp index 9e8059f58..53b3fcee4 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -151,10 +151,11 @@ template void lu_invertible() MatrixType m1_inverse = lu.inverse(); VERIFY_IS_APPROX(m2, m1_inverse*m3); - // Test condition number estimation. + // Verify that the estimated condition number is within a factor of 10 of the + // truth. RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); - // Verify that the estimate is within a factor of 10 of the truth. 
- VERIFY(lu.rcond() > rcond / 10 && lu.rcond() < rcond * 10); + const RealScalar rcond_est = lu.rcond(); + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); // test solve with transposed lu.template _solve_impl_transposed(m3, m2); @@ -199,7 +200,8 @@ template void lu_partial_piv() // Test condition number estimation. RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); // Verify that the estimate is within a factor of 10 of the truth. - VERIFY(plu.rcond() > rcond / 10 && plu.rcond() < rcond * 10); + const RealScalar rcond_est = plu.rcond(); + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); // test solve with transposed plu.template _solve_impl_transposed(m3, m2); -- cgit v1.2.3 From 9d51f7c457671bfcbab9a1d62d416e1a83e6ad8a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 1 Apr 2016 16:48:38 -0700 Subject: Add rcond method to LDLT. --- Eigen/src/Cholesky/LDLT.h | 54 ++++++++++++++++++++++++++++++++++++----------- test/cholesky.cpp | 17 +++++++++++++++ 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index c3cc3746c..9753c84d8 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -13,7 +13,7 @@ #ifndef EIGEN_LDLT_H #define EIGEN_LDLT_H -namespace Eigen { +namespace Eigen { namespace internal { template struct LDLT_Traits; @@ -73,11 +73,11 @@ template class LDLT * The default constructor is useful in cases in which the user intends to * perform decompositions via LDLT::compute(const MatrixType&). */ - LDLT() - : m_matrix(), - m_transpositions(), + LDLT() + : m_matrix(), + m_transpositions(), m_sign(internal::ZeroSign), - m_isInitialized(false) + m_isInitialized(false) {} /** \brief Default Constructor with memory preallocation @@ -168,7 +168,7 @@ template class LDLT * \note_about_checking_solutions * * More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$ - * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, + * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function @@ -192,6 +192,15 @@ template class LDLT template LDLT& compute(const EigenBase& matrix); + /** \returns an estimate of the reciprocal condition number of the matrix of + * which *this is the LDLT decomposition. + */ + RealScalar rcond() const + { + eigen_assert(m_isInitialized && "LDLT is not initialized."); + return ConditionEstimator, true >::rcond(m_l1_norm, *this); + } + template LDLT& rankUpdate(const MatrixBase& w, const RealScalar& alpha=1); @@ -220,7 +229,7 @@ template class LDLT eigen_assert(m_isInitialized && "LDLT is not initialized."); return Success; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -228,7 +237,7 @@ template class LDLT #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); @@ -241,6 +250,7 @@ template class LDLT * is not stored), and the diagonal entries correspond to D. 
*/ MatrixType m_matrix; + RealScalar m_l1_norm; TranspositionType m_transpositions; TmpMatrixType m_temporary; internal::SignMatrix m_sign; @@ -314,7 +324,7 @@ template<> struct ldlt_inplace if(rs>0) A21.noalias() -= A20 * temp.head(k); } - + // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot // was smaller than the cutoff value. However, since LDLT is not rank-revealing // we should only make sure that we do not introduce INF or NaN values. @@ -433,12 +443,32 @@ template LDLT& LDLT::compute(const EigenBase& a) { check_template_parameters(); - + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix = a.derived(); + // Compute matrix L1 norm = max abs column sum. + m_l1_norm = RealScalar(0); + if (_UpLo == Lower) { + for (int col = 0; col < size; ++col) { + const RealScalar abs_col_sum = m_matrix.col(col).tail(size - col).cwiseAbs().sum() + + m_matrix.row(col).tail(col).cwiseAbs().sum(); + if (abs_col_sum > m_l1_norm) { + m_l1_norm = abs_col_sum; + } + } + } else { + for (int col = 0; col < a.cols(); ++col) { + const RealScalar abs_col_sum = m_matrix.col(col).tail(col).cwiseAbs().sum() + + m_matrix.row(col).tail(size - col).cwiseAbs().sum(); + if (abs_col_sum > m_l1_norm) { + m_l1_norm = abs_col_sum; + } + } + } + m_transpositions.resize(size); m_isInitialized = false; m_temporary.resize(size); @@ -466,7 +496,7 @@ LDLT& LDLT::rankUpdate(const MatrixBase::_solve_impl(const RhsType &rhs, DstType &dst) cons // diagonal element is not well justified and leads to numerical issues in some cases. // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. RealScalar tolerance = RealScalar(1) / NumTraits::highest(); - + for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) diff --git a/test/cholesky.cpp b/test/cholesky.cpp index 8a21cdbd5..148a0b388 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -160,6 +160,15 @@ template void cholesky(const MatrixType& m) matX = ldltlo.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. + const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols)); + RealScalar rcond = (RealScalar(1) / matrix_l1_norm(symmLo)) / + matrix_l1_norm(symmLo_inverse); + RealScalar rcond_est = ldltlo.rcond(); + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + + LDLT ldltup(symmUp); VERIFY_IS_APPROX(symm, ldltup.reconstructedMatrix()); vecX = ldltup.solve(vecB); @@ -167,6 +176,14 @@ template void cholesky(const MatrixType& m) matX = ldltup.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. + const MatrixType symmUp_inverse = ldltup.solve(MatrixType::Identity(rows,cols)); + rcond = (RealScalar(1) / matrix_l1_norm(symmUp)) / + matrix_l1_norm(symmUp_inverse); + rcond_est = ldltup.rcond(); + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + VERIFY_IS_APPROX(MatrixType(ldltlo.matrixL().transpose().conjugate()), MatrixType(ldltlo.matrixU())); VERIFY_IS_APPROX(MatrixType(ldltlo.matrixU().transpose().conjugate()), MatrixType(ldltlo.matrixL())); VERIFY_IS_APPROX(MatrixType(ldltup.matrixL().transpose().conjugate()), MatrixType(ldltup.matrixU())); -- cgit v1.2.3 From 86e0ed81f8db5a0c9562b62a67a9ba60ec58dec0 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 4 Apr 2016 14:20:01 -0700 Subject: Addresses comments on Eigen pull request PR-174. * Get rid of code-duplication for real vs. 
complex matrices. * Fix flipped arguments to select. * Make the condition estimation functions free functions. * Use Vector::Unit() to generate canonical unit vectors. * Misc. cleanup. --- Eigen/src/Cholesky/LDLT.h | 12 +- Eigen/src/Cholesky/LLT.h | 12 +- Eigen/src/Core/ConditionEstimator.h | 405 ++++++++++++++---------------------- Eigen/src/LU/FullPivLU.h | 2 +- Eigen/src/LU/PartialPivLU.h | 2 +- test/cholesky.cpp | 8 +- test/lu.cpp | 7 +- 7 files changed, 179 insertions(+), 269 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 9753c84d8..80f18977c 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -198,7 +198,7 @@ template class LDLT RealScalar rcond() const { eigen_assert(m_isInitialized && "LDLT is not initialized."); - return ConditionEstimator, true >::rcond(m_l1_norm, *this); + return ReciprocalConditionNumberEstimate(m_l1_norm, *this); } template @@ -216,6 +216,12 @@ template class LDLT MatrixType reconstructedMatrix() const; + /** \returns the decomposition itself to allow generic code to do + * ldlt.transpose().solve(rhs). + */ + const LDLT& transpose() const { return *this; }; + const LDLT& adjoint() const { return *this; }; + inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } @@ -454,14 +460,14 @@ LDLT& LDLT::compute(const EigenBase m_l1_norm) { m_l1_norm = abs_col_sum; } } } else { for (int col = 0; col < a.cols(); ++col) { - const RealScalar abs_col_sum = m_matrix.col(col).tail(col).cwiseAbs().sum() + + const RealScalar abs_col_sum = m_matrix.col(col).head(col).cwiseAbs().sum() + m_matrix.row(col).tail(size - col).cwiseAbs().sum(); if (abs_col_sum > m_l1_norm) { m_l1_norm = abs_col_sum; diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index b55c5bebf..94da1d52d 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -142,7 +142,7 @@ template class LLT { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative"); - return ConditionEstimator, true >::rcond(m_l1_norm, *this); + return ReciprocalConditionNumberEstimate(m_l1_norm, *this); } /** \returns the LLT decomposition matrix @@ -169,6 +169,12 @@ template class LLT return m_info; } + /** \returns the decomposition itself to allow generic code to do + * llt.transpose().solve(rhs). 
+ */ + const LLT& transpose() const { return *this; }; + const LLT& adjoint() const { return *this; }; + inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } @@ -409,14 +415,14 @@ LLT& LLT::compute(const EigenBase if (_UpLo == Lower) { for (int col = 0; col < size; ++col) { const RealScalar abs_col_sum = m_matrix.col(col).tail(size - col).cwiseAbs().sum() + - m_matrix.row(col).tail(col).cwiseAbs().sum(); + m_matrix.row(col).head(col).cwiseAbs().sum(); if (abs_col_sum > m_l1_norm) { m_l1_norm = abs_col_sum; } } } else { for (int col = 0; col < a.cols(); ++col) { - const RealScalar abs_col_sum = m_matrix.col(col).tail(col).cwiseAbs().sum() + + const RealScalar abs_col_sum = m_matrix.col(col).head(col).cwiseAbs().sum() + m_matrix.row(col).tail(size - col).cwiseAbs().sum(); if (abs_col_sum > m_l1_norm) { m_l1_norm = abs_col_sum; diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index dca3da417..12b4ae648 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -13,45 +13,35 @@ namespace Eigen { namespace internal { -template -struct EstimateInverseMatrixL1NormImpl {}; -} // namespace internal - -template -class ConditionEstimator { - public: - typedef typename Decomposition::MatrixType MatrixType; - typedef typename internal::traits::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename internal::plain_col_type::type Vector; +template +inline typename MatrixType::RealScalar MatrixL1Norm(const MatrixType& matrix) { + return matrix.cwiseAbs().colwise().sum().maxCoeff(); +} + +template +inline typename Vector::RealScalar VectorL1Norm(const Vector& v) { + return v.template lpNorm<1>(); +} + +template +struct SignOrUnity { + static inline Vector run(const Vector& v) { + const RealVector v_abs = v.cwiseAbs(); + return (v_abs.array() == 0).select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs)); + } +}; - /** \class ConditionEstimator - * \ingroup Core_Module - * - * \brief Condition number estimator. - * - * Computing a decomposition of a dense matrix takes O(n^3) operations, while - * this method estimates the condition number quickly and reliably in O(n^2) - * operations. - * - * \returns an estimate of the reciprocal condition number - * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given the matrix and - * its decomposition. Supports the following decompositions: FullPivLU, - * PartialPivLU. - * - * \sa FullPivLU, PartialPivLU. - */ - static RealScalar rcond(const MatrixType& matrix, const Decomposition& dec) { - eigen_assert(matrix.rows() == dec.rows()); - eigen_assert(matrix.cols() == dec.cols()); - eigen_assert(matrix.rows() == matrix.cols()); - if (dec.rows() == 0) { - return RealScalar(1); - } - return rcond(MatrixL1Norm(matrix), dec); +// Partial specialization to avoid elementwise division for real vectors. +template +struct SignOrUnity { + static inline Vector run(const Vector& v) { + return (v.array() < 0).select(-Vector::Ones(v.size()), Vector::Ones(v.size())); } +}; - /** \class ConditionEstimator +} // namespace internal + +/** \class ConditionEstimator * \ingroup Core_Module * * \brief Condition number estimator. @@ -61,245 +51,154 @@ class ConditionEstimator { * operations. * * \returns an estimate of the reciprocal condition number - * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and + * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given the matrix and * its decomposition. 
Supports the following decompositions: FullPivLU, * PartialPivLU. * * \sa FullPivLU, PartialPivLU. */ - static RealScalar rcond(RealScalar matrix_norm, const Decomposition& dec) { - eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) { - return 1; - } - if (matrix_norm == 0) { - return 0; - } - const RealScalar inverse_matrix_norm = EstimateInverseMatrixL1Norm(dec); - return inverse_matrix_norm == 0 ? 0 - : (1 / inverse_matrix_norm) / matrix_norm; +template +typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( + const typename Decomposition::MatrixType& matrix, + const Decomposition& dec) { + eigen_assert(matrix.rows() == dec.rows()); + eigen_assert(matrix.cols() == dec.cols()); + eigen_assert(matrix.rows() == matrix.cols()); + if (dec.rows() == 0) { + return Decomposition::RealScalar(1); } - - /** - * \returns an estimate of ||inv(matrix)||_1 given a decomposition of - * matrix that implements .solve() and .adjoint().solve() methods. - * - * The method implements Algorithms 4.1 and 5.1 from - * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf - * which also forms the basis for the condition number estimators in - * LAPACK. Since at most 10 calls to the solve method of dec are - * performed, the total cost is O(dims^2), as opposed to O(dims^3) - * needed to compute the inverse matrix explicitly. - * - * The most common usage is in estimating the condition number - * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be - * computed directly in O(n^2) operations. - */ - static RealScalar EstimateInverseMatrixL1Norm(const Decomposition& dec) { - eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) { - return 0; - } - return internal::EstimateInverseMatrixL1NormImpl< - Decomposition, IsSelfAdjoint, - NumTraits::IsComplex != 0>::compute(dec); + return ReciprocalConditionNumberEstimate(MatrixL1Norm(matrix), dec); +} + +/** \class ConditionEstimator + * \ingroup Core_Module + * + * \brief Condition number estimator. + * + * Computing a decomposition of a dense matrix takes O(n^3) operations, while + * this method estimates the condition number quickly and reliably in O(n^2) + * operations. + * + * \returns an estimate of the reciprocal condition number + * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and + * its decomposition. Supports the following decompositions: FullPivLU, + * PartialPivLU. + * + * \sa FullPivLU, PartialPivLU. + */ +template +typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( + typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) { + eigen_assert(dec.rows() == dec.cols()); + if (dec.rows() == 0) { + return 1; } - - /** - * \returns the induced matrix l1-norm - * ||matrix||_1 = sup ||matrix * v||_1 / ||v||_1, which is equal to - * the greatest absolute column sum. - */ - static inline Scalar MatrixL1Norm(const MatrixType& matrix) { - return matrix.cwiseAbs().colwise().sum().maxCoeff(); + if (matrix_norm == 0) { + return 0; } -}; - -namespace internal { - -template -struct solve_helper { - static inline Vector solve_adjoint(const Decomposition& dec, - const Vector& v) { - return dec.adjoint().solve(v); - } -}; - -// Partial specialization for self_adjoint matrices. -template -struct solve_helper { - static inline Vector solve_adjoint(const Decomposition& dec, - const Vector& v) { - return dec.solve(v); - } -}; - - -// Partial specialization for real matrices. 
-template -struct EstimateInverseMatrixL1NormImpl { + const typename Decomposition::RealScalar inverse_matrix_norm = InverseMatrixL1NormEstimate(dec); + return inverse_matrix_norm == 0 ? 0 : (1 / inverse_matrix_norm) / matrix_norm; +} + +/** + * \returns an estimate of ||inv(matrix)||_1 given a decomposition of + * matrix that implements .solve() and .adjoint().solve() methods. + * + * The method implements Algorithms 4.1 and 5.1 from + * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf + * which also forms the basis for the condition number estimators in + * LAPACK. Since at most 10 calls to the solve method of dec are + * performed, the total cost is O(dims^2), as opposed to O(dims^3) + * needed to compute the inverse matrix explicitly. + * + * The most common usage is in estimating the condition number + * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be + * computed directly in O(n^2) operations. + */ +template +typename Decomposition::RealScalar InverseMatrixL1NormEstimate( + const Decomposition& dec) { typedef typename Decomposition::MatrixType MatrixType; - typedef typename internal::traits::Scalar Scalar; + typedef typename Decomposition::Scalar Scalar; + typedef typename Decomposition::RealScalar RealScalar; typedef typename internal::plain_col_type::type Vector; + typedef typename internal::plain_col_type::type RealVector; + const bool is_complex = (NumTraits::IsComplex != 0); - // Shorthand for vector L1 norm in Eigen. - static inline Scalar VectorL1Norm(const Vector& v) { - return v.template lpNorm<1>(); + eigen_assert(dec.rows() == dec.cols()); + const int n = dec.rows(); + if (n == 0) { + return 0; } - - static inline Scalar compute(const Decomposition& dec) { - const int n = dec.rows(); - const Vector plus = Vector::Ones(n); - Vector v = plus / n; - v = dec.solve(v); - Scalar lower_bound = VectorL1Norm(v); - if (n == 1) { - return lower_bound; - } - // lower_bound is a lower bound on - // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 - // and is the objective maximized by the ("super-") gradient ascent - // algorithm. - // Basic idea: We know that the optimum is achieved at one of the simplices - // v = e_i, so in each iteration we follow a super-gradient to move towards - // the optimal one. - Scalar old_lower_bound = lower_bound; - const Vector minus = -Vector::Ones(n); - Vector sign_vector = (v.cwiseAbs().array() == 0).select(plus, minus); - Vector old_sign_vector = sign_vector; - int v_max_abs_index = -1; - int old_v_max_abs_index = v_max_abs_index; - for (int k = 0; k < 4; ++k) { - // argmax |inv(matrix)^T * sign_vector| - v = solve_helper::solve_adjoint(dec, sign_vector); - v.cwiseAbs().maxCoeff(&v_max_abs_index); - if (v_max_abs_index == old_v_max_abs_index) { - // Break if the solution stagnated. - break; - } - // Move to the new simplex e_j, where j = v_max_abs_index. - v.setZero(); - v[v_max_abs_index] = 1; - v = dec.solve(v); // v = inv(matrix) * e_j. - lower_bound = VectorL1Norm(v); - if (lower_bound <= old_lower_bound) { - // Break if the gradient step did not increase the lower_bound. - break; - } - sign_vector = (v.array() < 0).select(plus, minus); + Vector v = Vector::Ones(n) / n; + v = dec.solve(v); + + // lower_bound is a lower bound on + // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 + // and is the objective maximized by the ("super-") gradient ascent + // algorithm below. 
+ RealScalar lower_bound = internal::VectorL1Norm(v); + if (n == 1) { + return lower_bound; + } + // Gradient ascent algorithm follows: We know that the optimum is achieved at + // one of the simplices v = e_i, so in each iteration we follow a + // super-gradient to move towards the optimal one. + RealScalar old_lower_bound = lower_bound; + Vector sign_vector(n); + Vector old_sign_vector; + int v_max_abs_index = -1; + int old_v_max_abs_index = v_max_abs_index; + for (int k = 0; k < 4; ++k) { + sign_vector = internal::SignOrUnity::run(v); + if (k > 0 && !is_complex) { if (sign_vector == old_sign_vector) { // Break if the solution stagnated. break; } - old_sign_vector = sign_vector; - old_v_max_abs_index = v_max_abs_index; - old_lower_bound = lower_bound; - } - // The following calculates an independent estimate of ||matrix||_1 by - // multiplying matrix by a vector with entries of slowly increasing - // magnitude and alternating sign: - // v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. - // This improvement to Hager's algorithm above is due to Higham. It was - // added to make the algorithm more robust in certain corner cases where - // large elements in the matrix might otherwise escape detection due to - // exact cancellation (especially when op and op_adjoint correspond to a - // sequence of backsubstitutions and permutations), which could cause - // Hager's algorithm to vastly underestimate ||matrix||_1. - Scalar alternating_sign = 1; - for (int i = 0; i < n; ++i) { - v[i] = alternating_sign * static_cast(1) + - (static_cast(i) / (static_cast(n - 1))); - alternating_sign = -alternating_sign; } - v = dec.solve(v); - const Scalar alternate_lower_bound = - (2 * VectorL1Norm(v)) / (3 * static_cast(n)); - return numext::maxi(lower_bound, alternate_lower_bound); - } -}; - -// Partial specialization for complex matrices. -template -struct EstimateInverseMatrixL1NormImpl { - typedef typename Decomposition::MatrixType MatrixType; - typedef typename internal::traits::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename internal::plain_col_type::type Vector; - typedef typename internal::plain_col_type::type - RealVector; - - // Shorthand for vector L1 norm in Eigen. - static inline RealScalar VectorL1Norm(const Vector& v) { - return v.template lpNorm<1>(); - } - - static inline RealScalar compute(const Decomposition& dec) { - const int n = dec.rows(); - const Vector ones = Vector::Ones(n); - Vector v = ones / n; - v = dec.solve(v); - RealScalar lower_bound = VectorL1Norm(v); - if (n == 1) { - return lower_bound; + // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )| + v = dec.adjoint().solve(sign_vector); + v.real().cwiseAbs().maxCoeff(&v_max_abs_index); + if (v_max_abs_index == old_v_max_abs_index) { + // Break if the solution stagnated. + break; } - // lower_bound is a lower bound on - // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 - // and is the objective maximized by the ("super-") gradient ascent - // algorithm. - // Basic idea: We know that the optimum is achieved at one of the simplices - // v = e_i, so in each iteration we follow a super-gradient to move towards - // the optimal one. 
- RealScalar old_lower_bound = lower_bound; - int v_max_abs_index = -1; - int old_v_max_abs_index = v_max_abs_index; - for (int k = 0; k < 4; ++k) { - // argmax |inv(matrix)^* * sign_vector| - RealVector abs_v = v.cwiseAbs(); - const Vector psi = - (abs_v.array() == 0).select(v.cwiseQuotient(abs_v), ones); - v = solve_helper::solve_adjoint(dec, psi); - const RealVector z = v.real(); - z.cwiseAbs().maxCoeff(&v_max_abs_index); - if (v_max_abs_index == old_v_max_abs_index) { - // Break if the solution stagnated. - break; - } - // Move to the new simplex e_j, where j = v_max_abs_index. - v.setZero(); - v[v_max_abs_index] = 1; - v = dec.solve(v); // v = inv(matrix) * e_j. - lower_bound = VectorL1Norm(v); - if (lower_bound <= old_lower_bound) { - // Break if the gradient step did not increase the lower_bound. - break; - } - old_v_max_abs_index = v_max_abs_index; - old_lower_bound = lower_bound; + // Move to the new simplex e_j, where j = v_max_abs_index. + v = dec.solve(Vector::Unit(n, v_max_abs_index)); // v = inv(matrix) * e_j. + lower_bound = internal::VectorL1Norm(v); + if (lower_bound <= old_lower_bound) { + // Break if the gradient step did not increase the lower_bound. + break; } - // The following calculates an independent estimate of ||matrix||_1 by - // multiplying matrix by a vector with entries of slowly increasing - // magnitude and alternating sign: - // v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. - // This improvement to Hager's algorithm above is due to Higham. It was - // added to make the algorithm more robust in certain corner cases where - // large elements in the matrix might otherwise escape detection due to - // exact cancellation (especially when op and op_adjoint correspond to a - // sequence of backsubstitutions and permutations), which could cause - // Hager's algorithm to vastly underestimate ||matrix||_1. - RealScalar alternating_sign = 1; - for (int i = 0; i < n; ++i) { - v[i] = alternating_sign * static_cast(1) + - (static_cast(i) / (static_cast(n - 1))); - alternating_sign = -alternating_sign; + if (!is_complex) { + old_sign_vector = sign_vector; } - v = dec.solve(v); - const RealScalar alternate_lower_bound = - (2 * VectorL1Norm(v)) / (3 * static_cast(n)); - return numext::maxi(lower_bound, alternate_lower_bound); + old_v_max_abs_index = v_max_abs_index; + old_lower_bound = lower_bound; } -}; + // The following calculates an independent estimate of ||matrix||_1 by + // multiplying matrix by a vector with entries of slowly increasing + // magnitude and alternating sign: + // v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. + // This improvement to Hager's algorithm above is due to Higham. It was + // added to make the algorithm more robust in certain corner cases where + // large elements in the matrix might otherwise escape detection due to + // exact cancellation (especially when op and op_adjoint correspond to a + // sequence of backsubstitutions and permutations), which could cause + // Hager's algorithm to vastly underestimate ||matrix||_1. 
+ Scalar alternating_sign = 1; + for (int i = 0; i < n; ++i) { + v[i] = alternating_sign * static_cast(1) + + (static_cast(i) / (static_cast(n - 1))); + alternating_sign = -alternating_sign; + } + v = dec.solve(v); + const RealScalar alternate_lower_bound = + (2 * internal::VectorL1Norm(v)) / (3 * static_cast(n)); + return numext::maxi(lower_bound, alternate_lower_bound); +} -} // namespace internal } // namespace Eigen #endif diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index ff0b78c35..978a54eff 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -237,7 +237,7 @@ template class FullPivLU inline RealScalar rcond() const { eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return ConditionEstimator >::rcond(m_l1_norm, *this); + return ReciprocalConditionNumberEstimate(m_l1_norm, *this); } /** \returns the determinant of the matrix of which diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 5d71a66d0..b22dd75fe 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -157,7 +157,7 @@ template class PartialPivLU inline RealScalar rcond() const { eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return ConditionEstimator >::rcond(m_l1_norm, *this); + return ReciprocalConditionNumberEstimate(m_l1_norm, *this); } /** \returns the inverse of the matrix of which *this is the LU decomposition. diff --git a/test/cholesky.cpp b/test/cholesky.cpp index 148a0b388..b7abc230b 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -91,12 +91,12 @@ template void cholesky(const MatrixType& m) matX = chollo.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); - // Verify that the estimated condition number is within a factor of 10 of the - // truth. const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols)); RealScalar rcond = (RealScalar(1) / matrix_l1_norm(symmLo)) / matrix_l1_norm(symmLo_inverse); RealScalar rcond_est = chollo.rcond(); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); // test the upper mode @@ -160,12 +160,12 @@ template void cholesky(const MatrixType& m) matX = ldltlo.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); - // Verify that the estimated condition number is within a factor of 10 of the - // truth. const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols)); RealScalar rcond = (RealScalar(1) / matrix_l1_norm(symmLo)) / matrix_l1_norm(symmLo_inverse); RealScalar rcond_est = ldltlo.rcond(); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); diff --git a/test/lu.cpp b/test/lu.cpp index 53b3fcee4..9787f4d86 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -151,10 +151,10 @@ template void lu_invertible() MatrixType m1_inverse = lu.inverse(); VERIFY_IS_APPROX(m2, m1_inverse*m3); - // Verify that the estimated condition number is within a factor of 10 of the - // truth. RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); const RealScalar rcond_est = lu.rcond(); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. 
VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); // test solve with transposed @@ -197,10 +197,9 @@ template void lu_partial_piv() MatrixType m1_inverse = plu.inverse(); VERIFY_IS_APPROX(m2, m1_inverse*m3); - // Test condition number estimation. RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); - // Verify that the estimate is within a factor of 10 of the truth. const RealScalar rcond_est = plu.rcond(); + // Verify that the estimate is within a factor of 10 of the truth. VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); // test solve with transposed -- cgit v1.2.3 From 513c3729605a8d447d6f4b494851601601db70e4 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 4 Apr 2016 14:34:59 -0700 Subject: Fix docstrings to list all supported decompositions. --- Eigen/src/Core/ConditionEstimator.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index 12b4ae648..35ec56128 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -53,9 +53,9 @@ struct SignOrUnity { * \returns an estimate of the reciprocal condition number * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given the matrix and * its decomposition. Supports the following decompositions: FullPivLU, - * PartialPivLU. + * PartialPivLU, LDLT, and LLT. * - * \sa FullPivLU, PartialPivLU. + * \sa FullPivLU, PartialPivLU, LDLT, LLT. */ template typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( @@ -82,9 +82,9 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( * \returns an estimate of the reciprocal condition number * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and * its decomposition. Supports the following decompositions: FullPivLU, - * PartialPivLU. + * PartialPivLU, LDLT, and LLT. * - * \sa FullPivLU, PartialPivLU. + * \sa FullPivLU, PartialPivLU, LDLT, LLT. */ template typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( @@ -114,6 +114,10 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( * The most common usage is in estimating the condition number * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be * computed directly in O(n^2) operations. + * + * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and LLT. + * + * \sa FullPivLU, PartialPivLU, LDLT, LLT. */ template typename Decomposition::RealScalar InverseMatrixL1NormEstimate( -- cgit v1.2.3 From 4d07064a3d357453aab7fe668065ce89a31ee4ab Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 5 Apr 2016 16:40:48 -0700 Subject: Fix bug in alternate lower bound calculation due to missing parentheses. Make a few expressions more concise. 
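
For illustration only (not part of the diff below): the Higham probe vector is
meant to be v_i = (-1)^i * (1 + i/(n-1)), so the alternating sign must apply to
the whole parenthesized term. A minimal sketch of the two groupings, written
with Scalar(...) shorthand:

    // intended: magnitude grows smoothly from 1 to 2, sign alternates
    v[i] = alternating_sign * (Scalar(1) + Scalar(i) / Scalar(n - 1));
    // previous precedence bug: only the leading 1 was sign-flipped,
    // i.e. v_i = (-1)^i + i/(n-1), which distorts the probe vector
    v[i] = alternating_sign * Scalar(1) + Scalar(i) / Scalar(n - 1);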
--- Eigen/src/Core/ConditionEstimator.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index 35ec56128..9027aa2f7 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -134,8 +134,7 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( if (n == 0) { return 0; } - Vector v = Vector::Ones(n) / n; - v = dec.solve(v); + Vector v = dec.solve(Vector::Ones(n) / n); // lower_bound is a lower bound on // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 @@ -155,11 +154,9 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( int old_v_max_abs_index = v_max_abs_index; for (int k = 0; k < 4; ++k) { sign_vector = internal::SignOrUnity::run(v); - if (k > 0 && !is_complex) { - if (sign_vector == old_sign_vector) { - // Break if the solution stagnated. - break; - } + if (k > 0 && !is_complex && sign_vector == old_sign_vector) { + // Break if the solution stagnated. + break; } // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )| v = dec.adjoint().solve(sign_vector); @@ -193,8 +190,9 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( // Hager's algorithm to vastly underestimate ||matrix||_1. Scalar alternating_sign = 1; for (int i = 0; i < n; ++i) { - v[i] = alternating_sign * static_cast(1) + - (static_cast(i) / (static_cast(n - 1))); + v[i] = alternating_sign * + (static_cast(1) + + (static_cast(i) / (static_cast(n - 1)))); alternating_sign = -alternating_sign; } v = dec.solve(v); -- cgit v1.2.3 From 0b5546d182928acb428daba0787755180da4b3fc Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 7 Apr 2016 15:49:30 -0700 Subject: Use lpNorm<1>() to compute l1 norms in LLT and LDLT. 
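
Note (illustrative, not part of the diff): the two spellings compute the same
value; inside the templated compute(), where the .template disambiguator is
required, the change only states the intent more directly:

    // l1 norm of the same column segment, written both ways
    RealScalar a = m_matrix.col(col).tail(size - col).cwiseAbs().sum();
    RealScalar b = m_matrix.col(col).tail(size - col).template lpNorm<1>();
    // a and b are identical: lpNorm<1>() is the sum of absolute values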
--- Eigen/src/Cholesky/LDLT.h | 8 ++++---- Eigen/src/Cholesky/LLT.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 80f18977c..902376fd6 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -459,16 +459,16 @@ LDLT& LDLT::compute(const EigenBase() + + m_matrix.row(col).head(col).template lpNorm<1>(); if (abs_col_sum > m_l1_norm) { m_l1_norm = abs_col_sum; } } } else { for (int col = 0; col < a.cols(); ++col) { - const RealScalar abs_col_sum = m_matrix.col(col).head(col).cwiseAbs().sum() + - m_matrix.row(col).tail(size - col).cwiseAbs().sum(); + const RealScalar abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + + m_matrix.row(col).tail(size - col).template lpNorm<1>(); if (abs_col_sum > m_l1_norm) { m_l1_norm = abs_col_sum; } diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 94da1d52d..dc2ccd6a4 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -414,16 +414,16 @@ LLT& LLT::compute(const EigenBase m_l1_norm = RealScalar(0); if (_UpLo == Lower) { for (int col = 0; col < size; ++col) { - const RealScalar abs_col_sum = m_matrix.col(col).tail(size - col).cwiseAbs().sum() + - m_matrix.row(col).head(col).cwiseAbs().sum(); + const RealScalar abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + + m_matrix.row(col).head(col).template lpNorm<1>(); if (abs_col_sum > m_l1_norm) { m_l1_norm = abs_col_sum; } } } else { for (int col = 0; col < a.cols(); ++col) { - const RealScalar abs_col_sum = m_matrix.col(col).head(col).cwiseAbs().sum() + - m_matrix.row(col).tail(size - col).cwiseAbs().sum(); + const RealScalar abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + + m_matrix.row(col).tail(size - col).template lpNorm<1>(); if (abs_col_sum > m_l1_norm) { m_l1_norm = abs_col_sum; } -- cgit v1.2.3 From fd872aefb3a9a0e4be08e66671494babdb921c2b Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 7 Apr 2016 16:28:44 -0700 Subject: Remove transpose() method from LLT and LDLT classes as it would imply conjugation. Explicitly cast constants to RealScalar in ConditionEstimator.h. --- Eigen/src/Cholesky/LDLT.h | 3 +-- Eigen/src/Cholesky/LLT.h | 3 +-- Eigen/src/Core/ConditionEstimator.h | 31 ++++++++++++++++++++----------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 902376fd6..d246a459c 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -217,9 +217,8 @@ template class LDLT MatrixType reconstructedMatrix() const; /** \returns the decomposition itself to allow generic code to do - * ldlt.transpose().solve(rhs). + * ldlt.adjoint().solve(rhs). */ - const LDLT& transpose() const { return *this; }; const LDLT& adjoint() const { return *this; }; inline Index rows() const { return m_matrix.rows(); } diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index dc2ccd6a4..f88afe8b5 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -170,9 +170,8 @@ template class LLT } /** \returns the decomposition itself to allow generic code to do - * llt.transpose().solve(rhs). + * llt.adjoint().solve(rhs). 
*/ - const LLT& transpose() const { return *this; }; const LLT& adjoint() const { return *this; }; inline Index rows() const { return m_matrix.rows(); } diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index 9027aa2f7..19c6102f7 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -27,7 +27,8 @@ template struct SignOrUnity { static inline Vector run(const Vector& v) { const RealVector v_abs = v.cwiseAbs(); - return (v_abs.array() == 0).select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs)); + return (v_abs.array() == static_cast(0)) + .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs)); } }; @@ -35,7 +36,8 @@ struct SignOrUnity { template struct SignOrUnity { static inline Vector run(const Vector& v) { - return (v.array() < 0).select(-Vector::Ones(v.size()), Vector::Ones(v.size())); + return (v.array() < static_cast(0)) + .select(-Vector::Ones(v.size()), Vector::Ones(v.size())); } }; @@ -65,7 +67,7 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( eigen_assert(matrix.cols() == dec.cols()); eigen_assert(matrix.rows() == matrix.cols()); if (dec.rows() == 0) { - return Decomposition::RealScalar(1); + return static_cast(1); } return ReciprocalConditionNumberEstimate(MatrixL1Norm(matrix), dec); } @@ -89,15 +91,20 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( template typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) { + typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); if (dec.rows() == 0) { - return 1; + return static_cast(1); } - if (matrix_norm == 0) { - return 0; + if (matrix_norm == static_cast(0)) { + return static_cast(0); } - const typename Decomposition::RealScalar inverse_matrix_norm = InverseMatrixL1NormEstimate(dec); - return inverse_matrix_norm == 0 ? 0 : (1 / inverse_matrix_norm) / matrix_norm; + const typename Decomposition::RealScalar inverse_matrix_norm = + InverseMatrixL1NormEstimate(dec); + return (inverse_matrix_norm == static_cast(0) + ? static_cast(0) + : (static_cast(1) / inverse_matrix_norm) / + matrix_norm); } /** @@ -115,7 +122,8 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be * computed directly in O(n^2) operations. * - * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and LLT. + * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and + * LLT. * * \sa FullPivLU, PartialPivLU, LDLT, LLT. */ @@ -126,7 +134,8 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( typedef typename Decomposition::Scalar Scalar; typedef typename Decomposition::RealScalar RealScalar; typedef typename internal::plain_col_type::type Vector; - typedef typename internal::plain_col_type::type RealVector; + typedef typename internal::plain_col_type::type + RealVector; const bool is_complex = (NumTraits::IsComplex != 0); eigen_assert(dec.rows() == dec.cols()); @@ -188,7 +197,7 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( // exact cancellation (especially when op and op_adjoint correspond to a // sequence of backsubstitutions and permutations), which could cause // Hager's algorithm to vastly underestimate ||matrix||_1. 
- Scalar alternating_sign = 1; + Scalar alternating_sign(static_cast(1)); for (int i = 0; i < n; ++i) { v[i] = alternating_sign * (static_cast(1) + -- cgit v1.2.3 From d51803a728c95bf32ff9b920db95f5106749f719 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 7 Apr 2016 16:39:48 -0700 Subject: Use Index instead of int for indexing and sizes. --- Eigen/src/Core/ConditionEstimator.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index 19c6102f7..d7f6ef53d 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -139,11 +139,11 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( const bool is_complex = (NumTraits::IsComplex != 0); eigen_assert(dec.rows() == dec.cols()); - const int n = dec.rows(); + const Index n = dec.rows(); if (n == 0) { return 0; } - Vector v = dec.solve(Vector::Ones(n) / n); + Vector v = dec.solve(Vector::Ones(n) / static_cast(n)); // lower_bound is a lower bound on // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 @@ -159,8 +159,8 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( RealScalar old_lower_bound = lower_bound; Vector sign_vector(n); Vector old_sign_vector; - int v_max_abs_index = -1; - int old_v_max_abs_index = v_max_abs_index; + Index v_max_abs_index = -1; + Index old_v_max_abs_index = v_max_abs_index; for (int k = 0; k < 4; ++k) { sign_vector = internal::SignOrUnity::run(v); if (k > 0 && !is_complex && sign_vector == old_sign_vector) { @@ -198,7 +198,7 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( // sequence of backsubstitutions and permutations), which could cause // Hager's algorithm to vastly underestimate ||matrix||_1. Scalar alternating_sign(static_cast(1)); - for (int i = 0; i < n; ++i) { + for (Index i = 0; i < n; ++i) { v[i] = alternating_sign * (static_cast(1) + (static_cast(i) / (static_cast(n - 1)))); -- cgit v1.2.3 From 283c51cd5edd5d706635b0bd05593f84cb8d618a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 7 Apr 2016 16:45:40 -0700 Subject: Widen short-circuiting ReciprocalConditionNumberEstimate so we don't call InverseMatrixL1NormEstimate for dec.rows() <= 1. --- Eigen/src/Core/ConditionEstimator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index d7f6ef53d..c4c073fa2 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -66,7 +66,7 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( eigen_assert(matrix.rows() == dec.rows()); eigen_assert(matrix.cols() == dec.cols()); eigen_assert(matrix.rows() == matrix.cols()); - if (dec.rows() == 0) { + if (dec.rows() <= 1) { return static_cast(1); } return ReciprocalConditionNumberEstimate(MatrixL1Norm(matrix), dec); @@ -93,7 +93,7 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) { + if (dec.rows() <= 1) { return static_cast(1); } if (matrix_norm == static_cast(0)) { -- cgit v1.2.3 From ee6c69733aeb06942cabf3bccd12715ef0e43ecf Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sat, 9 Apr 2016 12:45:49 -0700 Subject: A few tiny adjustments to short-circuit logic. 
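
For context, a hedged usage sketch of the function adjusted here (an assumed
example, not taken from the patch), together with the conventions the
short-circuits below implement:

    #include <Eigen/Dense>

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(50, 50);
    Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);
    // O(n^2) estimate of 1 / (||A||_1 * ||inv(A)||_1) given the decomposition.
    double rc = Eigen::ReciprocalConditionNumberEstimate(A, lu);
    // Degenerate cases: 0x0 matrix -> 1, ||A||_1 == 0 -> 0, 1x1 matrix -> 1.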
--- Eigen/src/Core/ConditionEstimator.h | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index c4c073fa2..5fb0cdbd5 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -65,10 +65,6 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( const Decomposition& dec) { eigen_assert(matrix.rows() == dec.rows()); eigen_assert(matrix.cols() == dec.cols()); - eigen_assert(matrix.rows() == matrix.cols()); - if (dec.rows() <= 1) { - return static_cast(1); - } return ReciprocalConditionNumberEstimate(MatrixL1Norm(matrix), dec); } @@ -93,18 +89,20 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() <= 1) { - return static_cast(1); + if (dec.rows() == 0) { + return RealScalar(1); + } + if (matrix_norm == RealScalar(0)) { + return RealScalar(0); } - if (matrix_norm == static_cast(0)) { - return static_cast(0); + if (dec.rows() == 1) { + return RealScalar(1); } const typename Decomposition::RealScalar inverse_matrix_norm = InverseMatrixL1NormEstimate(dec); - return (inverse_matrix_norm == static_cast(0) - ? static_cast(0) - : (static_cast(1) / inverse_matrix_norm) / - matrix_norm); + return (inverse_matrix_norm == RealScalar(0) + ? RealScalar(0) + : (RealScalar(1) / inverse_matrix_norm) / matrix_norm); } /** @@ -143,7 +141,7 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( if (n == 0) { return 0; } - Vector v = dec.solve(Vector::Ones(n) / static_cast(n)); + Vector v = dec.solve(Vector::Ones(n) / Scalar(n)); // lower_bound is a lower bound on // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 @@ -197,16 +195,15 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( // exact cancellation (especially when op and op_adjoint correspond to a // sequence of backsubstitutions and permutations), which could cause // Hager's algorithm to vastly underestimate ||matrix||_1. - Scalar alternating_sign(static_cast(1)); + Scalar alternating_sign(RealScalar(1)); for (Index i = 0; i < n; ++i) { v[i] = alternating_sign * - (static_cast(1) + - (static_cast(i) / (static_cast(n - 1)))); + (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1)))); alternating_sign = -alternating_sign; } v = dec.solve(v); const RealScalar alternate_lower_bound = - (2 * internal::VectorL1Norm(v)) / (3 * static_cast(n)); + (2 * internal::VectorL1Norm(v)) / (3 * RealScalar(n)); return numext::maxi(lower_bound, alternate_lower_bound); } -- cgit v1.2.3 From 096e355f8e1b12d3c1f50a8f69dfd7b01def54c5 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Sat, 9 Apr 2016 15:29:56 -0700 Subject: Add short-circuit to avoid calling matrix norm for empty matrix. 
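
The likely motivation (not spelled out in the patch itself): MatrixL1Norm()
reduces with colwise().sum().maxCoeff(), and maxCoeff() asserts on an empty
expression, so the 0x0 case has to return before the norm is ever formed.
Roughly, the guarded call then reads:

    if (dec.rows() == 0) return typename Decomposition::RealScalar(1);
    return ReciprocalConditionNumberEstimate(MatrixL1Norm(matrix), dec);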
--- Eigen/src/Core/ConditionEstimator.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index 5fb0cdbd5..f53c2a837 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -65,6 +65,7 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( const Decomposition& dec) { eigen_assert(matrix.rows() == dec.rows()); eigen_assert(matrix.cols() == dec.cols()); + if (dec.rows() == 0) return typename Decomposition::RealScalar(1); return ReciprocalConditionNumberEstimate(MatrixL1Norm(matrix), dec); } @@ -89,15 +90,9 @@ typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) { - return RealScalar(1); - } - if (matrix_norm == RealScalar(0)) { - return RealScalar(0); - } - if (dec.rows() == 1) { - return RealScalar(1); - } + if (dec.rows() == 0) return RealScalar(1); + if (matrix_norm == RealScalar(0)) return RealScalar(0); + if (dec.rows() == 1) return RealScalar(1); const typename Decomposition::RealScalar inverse_matrix_norm = InverseMatrixL1NormEstimate(dec); return (inverse_matrix_norm == RealScalar(0) -- cgit v1.2.3 From 748c4c4599918ef27b61bade7cea91c4ea8845e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 11 Apr 2016 13:11:04 -0700 Subject: More accurate cost estimates for exp, log, tanh, and sqrt. --- Eigen/src/Core/functors/UnaryFunctors.h | 88 ++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 7ba0abedc..22ceb4c1c 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -234,9 +234,33 @@ template struct scalar_exp_op { template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } }; -template -struct functor_traits > -{ enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasExp }; }; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasExp, + // The following numbers are based on the AVX implementation. +#ifdef EIGEN_VECTORIZE_FMA + // Haswell can issue 2 add/mul/madd per cycle. + Cost = + (sizeof(Scalar) == 4 + // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other + ? (8 * NumTraits::AddCost + 6 * NumTraits::MulCost) + // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other + : (14 * NumTraits::AddCost + + 6 * NumTraits::MulCost + + NumTraits::template Div::HasDiv>::Cost)), +#else + Cost = + (sizeof(Scalar) == 4 + // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other + ? (21 * NumTraits::AddCost + 13 * NumTraits::MulCost) + // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other + : (23 * NumTraits::AddCost + + 12 * NumTraits::MulCost + + NumTraits::template Div::HasDiv>::Cost)) +#endif + }; +}; /** \internal * @@ -250,9 +274,24 @@ template struct scalar_log_op { template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); } }; -template -struct functor_traits > -{ enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasLog }; }; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasLog, + Cost = + (PacketAccess + // The following numbers are based on the AVX implementation. 
+#ifdef EIGEN_VECTORIZE_FMA + // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle. + ? (20 * NumTraits::AddCost + 7 * NumTraits::MulCost) +#else + // 8 pmadd, 6 pmul, 8 padd/psub, 20 other + ? (36 * NumTraits::AddCost + 14 * NumTraits::MulCost) +#endif + // Measured cost of std::log. + : sizeof(Scalar)==4 ? 40 : 85) + }; +}; /** \internal * @@ -280,10 +319,19 @@ template struct scalar_sqrt_op { template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } }; -template -struct functor_traits > -{ enum { - Cost = 5 * NumTraits::MulCost, +template +struct functor_traits > { + enum { +#if EIGEN_FAST_MATH + // The following numbers are based on the AVX implementation. + Cost = (sizeof(Scalar) == 8 ? 28 + // 4 pmul, 1 pmadd, 3 other + : (3 * NumTraits::AddCost + + 5 * NumTraits::MulCost)), +#else + // The following numbers are based on min VSQRT throughput on Haswell. + Cost = (sizeof(Scalar) == 8 ? 28 : 14), +#endif PacketAccess = packet_traits::HasSqrt }; }; @@ -574,8 +622,24 @@ template struct functor_traits > { enum { - Cost = 5 * NumTraits::MulCost, - PacketAccess = packet_traits::HasTanh + PacketAccess = packet_traits::HasTanh, + Cost = + (PacketAccess + // The following numbers are based on the AVX implementation, +#ifdef EIGEN_VECTORIZE_FMA + // Haswell can issue 2 add/mul/madd per cycle. + // 9 pmadd, 2 pmul, 1 div, 2 other + ? (2 * NumTraits::AddCost + 6 * NumTraits::MulCost + + NumTraits::template Div::HasDiv>::Cost) +#else + ? (11 * NumTraits::AddCost + + 11 * NumTraits::MulCost + + NumTraits::template Div::HasDiv>::Cost) +#endif + // This number assumes a naive implementation of tanh + : (6 * NumTraits::AddCost + 3 * NumTraits::MulCost + + 2 * NumTraits::template Div::HasDiv>::Cost + + functor_traits >::Cost)) }; }; -- cgit v1.2.3 From 3b76df64fc73d533eaed83b21c30d2cd15f0f0f2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Apr 2016 10:58:51 -0700 Subject: Defer the decision to vectorize tensor CUDA code to the meta kernel. This makes it possible to decide to vectorize or not depending on the capability of the target cuda architecture. In particular, this enables us to vectorize the processing of fp16 when running on device of capability >= 5.3 --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 109 +++++++++------------ 1 file changed, 45 insertions(+), 64 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 4f4e07aaf..eabfd91fe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -147,98 +147,78 @@ class TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. 
#if defined(EIGEN_USE_GPU) -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index Index; - static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; -template -class TensorExecutor { - public: - typedef typename Expression::Index Index; - static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); -}; #if defined(__CUDACC__) +template +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator eval, Index first, Index last, Index step_size) { + for (Index i = first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator eval, Index first, Index last, Index step_size) { + const Index PacketSize = unpacket_traits::size; + const Index vectorized_size = (last / PacketSize) * PacketSize; + const Index vectorized_step_size = step_size * PacketSize; + + // Use the vector path + for (Index i = first * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; template __global__ void __launch_bounds__(1024) -EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) { - // Cuda memcopies the kernel arguments. That's fine for POD, but for more - // complex types such as evaluators we should really conform to the C++ - // standard and call a proper copy constructor. - Evaluator eval(memcopied_eval); +EigenMetaKernel(Evaluator memcopied_eval, Index size) { const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const Index step_size = blockDim.x * gridDim.x; - // Use the scalar path - for (Index i = first_index; i < size; i += step_size) { - eval.evalScalar(i); - } -} - -template -__global__ void -__launch_bounds__(1024) -EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { // Cuda memcopies the kernel arguments. That's fine for POD, but for more // complex types such as evaluators we should really conform to the C++ // standard and call a proper copy constructor. 
Evaluator eval(memcopied_eval); - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; - - // Use the vector path - const Index PacketSize = unpacket_traits::size; - const Index vectorized_step_size = step_size * PacketSize; - const Index vectorized_size = (size / PacketSize) * PacketSize; - for (Index i = first_index * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first_index; i < size; i += step_size) { - eval.evalScalar(i); - } + const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; + EigenMetaKernelEval::run(eval, first_index, size, step_size); } /*static*/ -template -EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) -{ +template +inline void TensorExecutor::run( + const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = numext::mini(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; const Index size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash if we're called with tensors of size 0. + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); - LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); -} - -/*static*/ -template -EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) -{ - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = numext::mini(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); - const Index size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash if we're called with tensors of size 0. 
- const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); - LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel, Index>), + num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } @@ -246,6 +226,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor::run(c #endif // __CUDACC__ #endif // EIGEN_USE_GPU + } // end namespace internal } // end namespace Eigen -- cgit v1.2.3 From e3a184785c3ea106b8f74471a325ea0945127639 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Apr 2016 11:12:36 -0700 Subject: Fixed the zeta test --- unsupported/test/cxx11_tensor_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 134359611..db2d8ee4c 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -718,7 +718,7 @@ void test_cuda_zeta() assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY_IS_APPROX_OR_LESS_THAN(out(3), expected_out(3)); + VERIFY((std::isnan)(out(3))); for (int i = 1; i < 6; ++i) { if (i != 3) { -- cgit v1.2.3 From b67c983291706bd5cc776b3202cacb6ca6508b76 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 12 Apr 2016 23:03:03 +0200 Subject: Enable the use of half-packet in coeff-based product. For instance, Matrix4f*Vector4f is now vectorized again when using AVX. --- Eigen/src/Core/ProductEvaluators.h | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 3ce86e8cd..443a58c27 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -410,8 +410,6 @@ struct product_evaluator, ProductTag, DenseShape, typedef Product XprType; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketScalar PacketScalar; - typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr) @@ -437,16 +435,20 @@ struct product_evaluator, ProductTag, DenseShape, typedef evaluator LhsEtorType; typedef evaluator RhsEtorType; - + enum { RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime, ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime, InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime), MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime, - MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime, - - PacketSize = packet_traits::size, + MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime + }; + + typedef typename find_best_packet::type LhsVecPacketType; + typedef typename find_best_packet::type RhsVecPacketType; + enum { + LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, CoeffReadCost = InnerSize==0 ? 
NumTraits::ReadCost @@ -459,19 +461,23 @@ struct product_evaluator, ProductTag, DenseShape, LhsFlags = LhsEtorType::Flags, RhsFlags = RhsEtorType::Flags, - LhsAlignment = LhsEtorType::Alignment, - RhsAlignment = RhsEtorType::Alignment, - LhsRowMajor = LhsFlags & RowMajorBit, RhsRowMajor = RhsFlags & RowMajorBit, + + LhsVecPacketSize = unpacket_traits::size, + RhsVecPacketSize = unpacket_traits::size, + + // + LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), + RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), SameType = is_same::value, CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) - && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % PacketSize) == 0) ), + && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % RhsVecPacketSize) == 0) ), CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) - && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % PacketSize) == 0) ), + && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % LhsVecPacketSize) == 0) ), EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 @@ -491,10 +497,10 @@ struct product_evaluator, ProductTag, DenseShape, : 0, /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside - * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner - * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect - * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. - */ + * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner + * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect + * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. 
+ */ CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) -- cgit v1.2.3 From 53121c0119d046228c2911c39cf33157479be88d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Apr 2016 14:11:52 -0700 Subject: Turned on the contraction benchmarks for fp16 --- bench/tensors/tensor_benchmarks_fp16_gpu.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index d34bd73ca..d6719b7a2 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -48,11 +48,11 @@ BM_FuncGPU(colReduction); BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); -/*BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); BM_FuncWithInputDimsGPU(contraction, N, N, 64); -*/ + // Convolutions #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ -- cgit v1.2.3 From 25d05c4b8f54fbfb089be740601d20fc88cde58b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Apr 2016 14:13:25 -0700 Subject: Fixed the vectorization logic test --- test/vectorization_logic.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 35fbb9781..283a86c7d 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -147,10 +147,10 @@ struct vectorization_logic VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3), InnerVectorizedTraversal,CompleteUnrolling)); - + VERIFY(test_assign(Matrix44r().row(2),Matrix44r().row(1)+Matrix44r().row(1), InnerVectorizedTraversal,CompleteUnrolling)); - + if(PacketSize>1) { typedef Matrix Matrix33c; @@ -158,17 +158,17 @@ struct vectorization_logic LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), LinearTraversal,CompleteUnrolling)); - - VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), - PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling)); - + + VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()), + LinearVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_assign(Matrix(),Matrix()+Matrix(), HalfPacketSize==1 ? 
InnerVectorizedTraversal : LinearTraversal,NoUnrolling)); - + VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); } - + VERIFY(test_redux(Matrix3(), LinearVectorizedTraversal,CompleteUnrolling)); -- cgit v1.2.3 From aa1ba8bbd2144c61a310e6d7fd50e694453a6b96 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Apr 2016 16:28:11 -0700 Subject: Don't put a command at the end of an enumerator list --- Eigen/src/Core/functors/UnaryFunctors.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 22ceb4c1c..ae2889ab3 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -248,7 +248,7 @@ struct functor_traits > { // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other : (14 * NumTraits::AddCost + 6 * NumTraits::MulCost + - NumTraits::template Div::HasDiv>::Cost)), + NumTraits::template Div::HasDiv>::Cost)) #else Cost = (sizeof(Scalar) == 4 -- cgit v1.2.3 From eaeb6ca93a8c2efdd1aca6906a8c3fdbfa1740bc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Apr 2016 16:29:00 -0700 Subject: Enable the benchmarks for algebraic and transcendental fnctions on fp16. --- bench/tensors/tensor_benchmarks_fp16_gpu.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index d6719b7a2..14876556e 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -29,8 +29,8 @@ BM_FuncGPU(padding); BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); -//BM_FuncGPU(algebraicFunc); -//BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); BM_FuncGPU(rowReduction); BM_FuncGPU(colReduction); -- cgit v1.2.3 From e9b12cc1f714949e3dbc5587950ff29570aef588 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 12 Apr 2016 20:53:18 -0700 Subject: Fixed compilation warnings generated by clang --- unsupported/test/cxx11_tensor_math.cpp | 4 ++-- unsupported/test/cxx11_tensor_mixed_indices.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp index d247bebaa..61c742a16 100644 --- a/unsupported/test/cxx11_tensor_math.cpp +++ b/unsupported/test/cxx11_tensor_math.cpp @@ -16,7 +16,7 @@ using Eigen::RowMajor; static void test_tanh() { - Tensor vec1({6}); + Tensor vec1(6); vec1.setRandom(); Tensor vec2 = vec1.tanh(); @@ -28,7 +28,7 @@ static void test_tanh() static void test_sigmoid() { - Tensor vec1({6}); + Tensor vec1(6); vec1.setRandom(); Tensor vec2 = vec1.sigmoid(); diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp index 72f826216..4fba6fdd1 100644 --- a/unsupported/test/cxx11_tensor_mixed_indices.cpp +++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp @@ -14,8 +14,8 @@ static void test_simple() { - Tensor vec1({6}); - Tensor vec2({6}); + Tensor vec1(6); + Tensor vec2(6); vec1(0) = 4.0; vec2(0) = 0.0; vec1(1) = 8.0; vec2(1) = 1.0; -- cgit v1.2.3 From bbb8854bf775a9691cacf6f0fb6e8bed141100c7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 13:02:34 +0200 Subject: Enable half-packet in reduxions. 
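
Illustrative effect (an assumed example, not from the patch): with AVX, where
the full float packet holds 8 elements, reductions on 4-float objects could not
use packets before; find_best_packet now lets them run with the 4-wide half
packet, e.g.:

    Eigen::Matrix<float, 4, 1> v = Eigen::Matrix<float, 4, 1>::Random();
    float s = v.sum();  // previously a scalar loop under AVX, now half-packet code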
--- Eigen/src/Core/Redux.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index d170cae29..98b2fd868 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -27,8 +27,9 @@ template struct redux_traits { public: + typedef typename find_best_packet::type PacketType; enum { - PacketSize = packet_traits::size, + PacketSize = unpacket_traits::size, InnerMaxSize = int(Derived::IsRowMajor) ? Derived::MaxColsAtCompileTime : Derived::MaxRowsAtCompileTime @@ -137,12 +138,12 @@ template struct redux_vec_unroller { enum { - PacketSize = packet_traits::size, + PacketSize = redux_traits::PacketSize, HalfLength = Length/2 }; typedef typename Derived::Scalar Scalar; - typedef typename packet_traits::type PacketScalar; + typedef typename redux_traits::PacketType PacketScalar; static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) { @@ -156,14 +157,14 @@ template struct redux_vec_unroller { enum { - index = Start * packet_traits::size, + index = Start * redux_traits::PacketSize, outer = index / int(Derived::InnerSizeAtCompileTime), inner = index % int(Derived::InnerSizeAtCompileTime), alignment = Derived::Alignment }; typedef typename Derived::Scalar Scalar; - typedef typename packet_traits::type PacketScalar; + typedef typename redux_traits::PacketType PacketScalar; static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) { @@ -209,13 +210,13 @@ template struct redux_impl { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits::type PacketScalar; + typedef typename redux_traits::PacketType PacketScalar; static Scalar run(const Derived &mat, const Func& func) { const Index size = mat.size(); - const Index packetSize = packet_traits::size; + const Index packetSize = redux_traits::PacketSize; const int packetAlignment = unpacket_traits::alignment; enum { alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), @@ -268,7 +269,7 @@ template struct redux_impl { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits::type PacketType; + typedef typename redux_traits::PacketType PacketType; EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) { @@ -276,7 +277,7 @@ struct redux_impl const Index innerSize = mat.innerSize(); const Index outerSize = mat.outerSize(); enum { - packetSize = packet_traits::size + packetSize = redux_traits::PacketSize }; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; @@ -306,9 +307,10 @@ template struct redux_impl { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits::type PacketScalar; + + typedef typename redux_traits::PacketType PacketScalar; enum { - PacketSize = packet_traits::size, + PacketSize = redux_traits::PacketSize, Size = Derived::SizeAtCompileTime, VectorizedSize = (Size / PacketSize) * PacketSize }; @@ -367,11 +369,11 @@ public: { return m_evaluator.coeff(index); } template - PacketReturnType packet(Index row, Index col) const + PacketType packet(Index row, Index col) const { return m_evaluator.template packet(row, col); } template - PacketReturnType packet(Index index) const + PacketType packet(Index index) const { return m_evaluator.template packet(index); } EIGEN_DEVICE_FUNC @@ -379,7 +381,7 @@ public: { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? 
inner : outer); } template - PacketReturnType packetByOuterInner(Index outer, Index inner) const + PacketType packetByOuterInner(Index outer, Index inner) const { return m_evaluator.template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } const XprType & nestedExpression() const { return m_xpr; } -- cgit v1.2.3 From 06447e0a395109003d4ab197dcfc886065681f7c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 18:15:49 +0200 Subject: Improve half-packet vectorization logic to distinguish linear versus inner traversal modes. --- Eigen/src/Core/AssignEvaluator.h | 56 +++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 3de8aa9a2..9d4b315a0 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -29,13 +29,10 @@ struct copy_using_evaluator_traits { typedef typename DstEvaluator::XprType Dst; typedef typename Dst::Scalar DstScalar; - // TODO distinguish between linear traversal and inner-traversals - typedef typename find_best_packet::type PacketType; enum { DstFlags = DstEvaluator::Flags, - SrcFlags = SrcEvaluator::Flags, - RequiredAlignment = unpacket_traits::alignment + SrcFlags = SrcEvaluator::Flags }; public: @@ -55,10 +52,25 @@ private: : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime) : int(Dst::MaxRowsAtCompileTime), OuterStride = int(outer_stride_at_compile_time::ret), - MaxSizeAtCompileTime = Dst::SizeAtCompileTime, - PacketSize = unpacket_traits::size + MaxSizeAtCompileTime = Dst::SizeAtCompileTime + }; + + // TODO distinguish between linear traversal and inner-traversals + typedef typename find_best_packet::type LinearPacketType; + typedef typename find_best_packet::type InnerPacketType; + + enum { + LinearPacketSize = unpacket_traits::size, + InnerPacketSize = unpacket_traits::size }; +public: + enum { + LinearRequiredAlignment = unpacket_traits::alignment, + InnerRequiredAlignment = unpacket_traits::alignment + }; + +private: enum { DstIsRowMajor = DstFlags&RowMajorBit, SrcIsRowMajor = SrcFlags&RowMajorBit, @@ -67,16 +79,16 @@ private: && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit) && (functor_traits::PacketAccess), MayInnerVectorize = MightVectorize - && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0 - && int(OuterStride)!=Dynamic && int(OuterStride)%int(PacketSize)==0 - && int(JointAlignment)>=int(RequiredAlignment), + && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0 + && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 + && int(JointAlignment)>=int(InnerRequiredAlignment), MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess - && ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), + && ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. 
*/ MaySliceVectorize = MightVectorize && DstHasDirectAccess - && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize) + && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize) /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block in a fixed-size matrix */ @@ -84,7 +96,8 @@ private: public: enum { - Traversal = int(MayInnerVectorize) ? int(InnerVectorizedTraversal) + Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) : int(MayLinearize) ? int(LinearTraversal) @@ -94,9 +107,14 @@ public: || int(Traversal) == SliceVectorizedTraversal }; + typedef typename conditional::type PacketType; + private: enum { - UnrollingLimit = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1), + ActualPacketSize = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize + : Vectorized ? InnerPacketSize + : 1, + UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize, MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit), MayUnrollInner = int(InnerSize) != Dynamic @@ -112,7 +130,7 @@ public: : int(NoUnrolling) ) : int(Traversal) == int(LinearVectorizedTraversal) - ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling) + ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling) : int(NoUnrolling) ) : int(Traversal) == int(LinearTraversal) ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) @@ -131,11 +149,13 @@ public: std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(DstAlignment) EIGEN_DEBUG_VAR(SrcAlignment) - EIGEN_DEBUG_VAR(RequiredAlignment) + EIGEN_DEBUG_VAR(LinearRequiredAlignment) + EIGEN_DEBUG_VAR(InnerRequiredAlignment) EIGEN_DEBUG_VAR(JointAlignment) EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerMaxSize) - EIGEN_DEBUG_VAR(PacketSize) + EIGEN_DEBUG_VAR(LinearPacketSize) + EIGEN_DEBUG_VAR(InnerPacketSize) EIGEN_DEBUG_VAR(StorageOrdersAgree) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearize) @@ -370,7 +390,7 @@ struct dense_assignment_loop typedef typename Kernel::Scalar Scalar; typedef typename Kernel::PacketType PacketType; enum { - requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment, + requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment, packetSize = unpacket_traits::size, dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), dstAlignment = packet_traits::AlignedOnScalar ? int(requestedAlignment) @@ -484,7 +504,7 @@ struct dense_assignment_loop typedef typename Kernel::PacketType PacketType; enum { packetSize = unpacket_traits::size, - requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment), + requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment), alignable = packet_traits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar), dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), dstAlignment = alignable ? 
int(requestedAlignment) -- cgit v1.2.3 From 512ba0ac760c8dd0bd667cbe74f53c440b7e835f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 18:16:35 +0200 Subject: Add regression unit tests for half-packet vectorization --- test/vectorization_logic.cpp | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 283a86c7d..f1a74d765 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -22,7 +22,11 @@ template bool test_assign(const Dst&, const Src&, int traversal, int unrolling) { typedef internal::copy_using_evaluator_traits,internal::evaluator, internal::assign_op > traits; - bool res = traits::Traversal==traversal && traits::Unrolling==unrolling; + bool res = traits::Traversal==traversal; + if(unrolling==InnerUnrolling+CompleteUnrolling) + res = res && (traits::Unrolling==InnerUnrolling || traits::Unrolling==CompleteUnrolling); + else + res = res && traits::Unrolling==unrolling; if(!res) { std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl; @@ -167,8 +171,20 @@ struct vectorization_logic VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); + + VERIFY(test_assign(Vector1(),Matrix11()*Vector1(), + InnerVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()), + InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling)); } + VERIFY(test_redux(Vector1(), + LinearVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_redux(Matrix(), + LinearVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_redux(Matrix3(), LinearVectorizedTraversal,CompleteUnrolling)); @@ -226,6 +242,7 @@ struct vectorization_logic_half typedef Matrix Vector1; typedef Matrix Matrix11; typedef Matrix Matrix57; + typedef Matrix Matrix35; typedef Matrix Matrix57u; // typedef Matrix Matrix44; // typedef Matrix Matrix44u; @@ -291,12 +308,24 @@ struct vectorization_logic_half VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); + + VERIFY(test_assign(Vector1(),Matrix11()*Vector1(), + InnerVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()), + InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling)); } + VERIFY(test_redux(Vector1(), + LinearVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_redux(Matrix(), + LinearVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_redux(Matrix3(), LinearVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_redux(Matrix57(), + VERIFY(test_redux(Matrix35(), LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_redux(Matrix57().template block(1,0), -- cgit v1.2.3 From 2630d97c625d642a205ecf00723efd8e28b9639e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 18:26:08 +0200 Subject: Fix detection of same matrices when both matrices are not handled by extract_data. 
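The failure mode addressed here: when neither operand is backed by plain storage, extract_data() can return a null pointer for both sides, so a bare pointer comparison reads null == null, wrongly concludes the two matrices alias, and skips the required copy. As a rough illustration of the safer rule (aliasing only when both sides expose the same non-null pointer), here is a minimal sketch; probably_same_storage and the use of the public .data() accessor are illustrative stand-ins, not the internal extract_data/is_same_dense machinery:

#include <Eigen/Dense>

// Hypothetical helper: treat two objects as aliased only if both expose the
// same non-null data pointer. Assumes both arguments provide .data(), which
// plain matrices and maps do (general expressions do not).
template <typename A, typename B>
bool probably_same_storage(const A& a, const B& b) {
  const void* pa = a.data();
  const void* pb = b.data();
  return pa != 0 && pa == pb;   // a null pointer never counts as aliasing
}

int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 3);
  Eigen::Map<Eigen::MatrixXd> view(m.data(), 3, 3);
  Eigen::MatrixXd other = Eigen::MatrixXd::Random(3, 3);
  bool aliased   = probably_same_storage(m, view);    // true: same non-null pointer
  bool unrelated = probably_same_storage(m, other);   // false: distinct storage
  return (aliased && !unrelated) ? 0 : 1;
}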
--- Eigen/src/Core/ProductEvaluators.h | 5 +++-- Eigen/src/Core/SolveTriangular.h | 3 ++- Eigen/src/Core/TriangularMatrix.h | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 443a58c27..bd63b87ff 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -467,7 +467,7 @@ struct product_evaluator, ProductTag, DenseShape, LhsVecPacketSize = unpacket_traits::size, RhsVecPacketSize = unpacket_traits::size, - // + // Here, we don't care about alignment larger than the usable packet size. LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), @@ -1006,7 +1006,8 @@ struct transposition_matrix_product const Index size = tr.size(); StorageIndex j = 0; - if(!(is_same::value && extract_data(dst) == extract_data(mat))) + const typename Dest::Scalar *dst_data = internal::extract_data(dst); + if(!(is_same::value && dst_data == extract_data(mat))) dst = mat; for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k struct triangular_solv template inline void evalTo(Dest& dst) const { - if(!(is_same::value && extract_data(dst) == extract_data(m_rhs))) + const typename Dest::Scalar *dst_data = internal::extract_data(dst); + if(!(is_same::value && dst_data!=0 && dst_data == extract_data(m_rhs))) dst = m_rhs; m_triangularMatrix.template solveInPlace(dst); } diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index e6d137e40..fafc2a0c6 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -532,7 +532,8 @@ template class TriangularViewImpl<_Mat template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const { - if(!(internal::is_same::value && internal::extract_data(dst) == internal::extract_data(rhs))) + const typename DstType::Scalar *dst_data = internal::extract_data(dst); + if(!(internal::is_same::value && dst_data!=0 && dst_data == internal::extract_data(rhs))) dst = rhs; this->solveInPlace(dst); } -- cgit v1.2.3 From b7716c032893f4cc6f0e635bcffc2d8bc3d956ab Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 18:32:56 +0200 Subject: Fix incomplete previous patch on matrix comparision. 
--- Eigen/src/Core/ProductEvaluators.h | 2 +- Eigen/src/Householder/HouseholderSequence.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index bd63b87ff..2cdcf1c02 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -1007,7 +1007,7 @@ struct transposition_matrix_product StorageIndex j = 0; const typename Dest::Scalar *dst_data = internal::extract_data(dst); - if(!(is_same::value && dst_data == extract_data(mat))) + if(!(is_same::value && dst_data!=0 && dst_data == extract_data(mat))) dst = mat; for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k class HouseholderS { workspace.resize(rows()); Index vecs = m_length; + const typename Dest::Scalar *dst_data = internal::extract_data(dst); if( internal::is_same::type,Dest>::value - && internal::extract_data(dst) == internal::extract_data(m_vectors)) + && dst_data!=0 && dst_data == internal::extract_data(m_vectors)) { // in-place dst.diagonal().setOnes(); -- cgit v1.2.3 From 6f960b83ff1396bd46f02672d73e308db497e774 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 18:47:12 +0200 Subject: Make use of is_same_dense helper instead of extract_data to detect input/outputs are the same. --- Eigen/src/Core/ProductEvaluators.h | 3 +-- Eigen/src/Core/SolveTriangular.h | 3 +-- Eigen/src/Core/TriangularMatrix.h | 3 +-- Eigen/src/Householder/HouseholderSequence.h | 4 +--- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 2cdcf1c02..d9fd888cf 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -1006,8 +1006,7 @@ struct transposition_matrix_product const Index size = tr.size(); StorageIndex j = 0; - const typename Dest::Scalar *dst_data = internal::extract_data(dst); - if(!(is_same::value && dst_data!=0 && dst_data == extract_data(mat))) + if(!is_same_dense(dst,mat)) dst = mat; for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k struct triangular_solv template inline void evalTo(Dest& dst) const { - const typename Dest::Scalar *dst_data = internal::extract_data(dst); - if(!(is_same::value && dst_data!=0 && dst_data == extract_data(m_rhs))) + if(!is_same_dense(dst,m_rhs)) dst = m_rhs; m_triangularMatrix.template solveInPlace(dst); } diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index fafc2a0c6..5c5e5028e 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -532,8 +532,7 @@ template class TriangularViewImpl<_Mat template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const { - const typename DstType::Scalar *dst_data = internal::extract_data(dst); - if(!(internal::is_same::value && dst_data!=0 && dst_data == internal::extract_data(rhs))) + if(!internal::is_same_dense(dst,rhs)) dst = rhs; this->solveInPlace(dst); } diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index 3f5f9293b..e9f3ebf88 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -243,9 +243,7 @@ template class HouseholderS { workspace.resize(rows()); Index vecs = m_length; - const typename Dest::Scalar *dst_data = internal::extract_data(dst); - if( internal::is_same::type,Dest>::value - && dst_data!=0 && dst_data == internal::extract_data(m_vectors)) + if(is_same_dense(dst,m_vectors)) { // 
in-place dst.diagonal().setOnes(); -- cgit v1.2.3 From 42a3352a3b0565e0837d830a38dacdf1157d0271 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 19:02:02 +0200 Subject: Workaround a division by zero when outerstride==0 --- Eigen/src/Core/products/TriangularSolverMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 208593718..1bed66ed8 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -83,7 +83,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0; + Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max(otherStride,size)) : 0; subcols = std::max((subcols/Traits::nr)*Traits::nr, Traits::nr); for(Index k2=IsLower ? 0 : size; -- cgit v1.2.3 From 473c8380ea14d9451978ab8cc0812a127a063390 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Apr 2016 11:03:37 -0700 Subject: Added constructors to convert unsigned integers into fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 281b8e4c6..dd4cb10d9 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -70,12 +70,18 @@ struct half : public __half { explicit EIGEN_DEVICE_FUNC half(bool b) : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + explicit EIGEN_DEVICE_FUNC half(unsigned int ui) + : __half(internal::float_to_half_rtne(static_cast(ui))) {} explicit EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast(i))) {} + explicit EIGEN_DEVICE_FUNC half(unsigned long ul) + : __half(internal::float_to_half_rtne(static_cast(ul))) {} explicit EIGEN_DEVICE_FUNC half(long l) : __half(internal::float_to_half_rtne(static_cast(l))) {} explicit EIGEN_DEVICE_FUNC half(long long ll) : __half(internal::float_to_half_rtne(static_cast(ll))) {} + explicit EIGEN_DEVICE_FUNC half(unsigned long long ull) + : __half(internal::float_to_half_rtne(static_cast(ull))) {} explicit EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {} explicit EIGEN_DEVICE_FUNC half(double d) -- cgit v1.2.3 From bf3f6688f0ff1f6256da9200356a63c550ccae1b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Apr 2016 11:55:08 -0700 Subject: Added support for computing cos, sin, tan, and tanh on GPU. 
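These additions follow the dispatch pattern already used for exp and log: a generic template that resolves to the std:: function on the host, plus __CUDACC__-guarded float/double specializations that call the CUDA device math functions. A stripped-down sketch of that pattern, with the EIGEN_DEVICE_FUNC / EIGEN_ALWAYS_INLINE qualifiers omitted and a placeholder namespace standing in for numext:

#include <cmath>

namespace sketch_numext {

// Generic version: on the host this picks up std::cos (or an ADL overload
// for user-defined scalar types).
template <typename T>
inline T cos(const T& x) {
  using std::cos;
  return cos(x);
}

#ifdef __CUDACC__
// Device builds get explicit specializations calling the CUDA math
// functions, which are callable from __device__ code.
template <>
inline float cos(const float& x) { return ::cosf(x); }

template <>
inline double cos(const double& x) { return ::cos(x); }
#endif

}  // namespace sketch_numext

int main() {
  return sketch_numext::cos(0.0) == 1.0 ? 0 : 1;  // host-path sanity check
}

The same three-piece structure (generic template, float specialization, double specialization) is repeated below for cos, sin, tan and tanh.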
--- Eigen/src/Core/MathFunctions.h | 74 ++++++++++++++++++++++++++------- Eigen/src/Core/functors/UnaryFunctors.h | 8 ++-- 2 files changed, 63 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 8e7dd2b73..b27d51ac1 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1023,21 +1023,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double &x) { return ::log(x); } #endif -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T tan(const T &x) { - EIGEN_USING_STD_MATH(tan); - return tan(x); -} - -#ifdef __CUDACC__ -template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float tan(const float &x) { return ::tanf(x); } - -template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double tan(const double &x) { return ::tan(x); } -#endif - template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename NumTraits::Real abs(const T &x) { @@ -1068,6 +1053,65 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } #endif +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T cos(const T &x) { + EIGEN_USING_STD_MATH(cos); + return cos(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float cos(const float &x) { return ::cosf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double cos(const double &x) { return ::cos(x); } +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sin(const T &x) { + EIGEN_USING_STD_MATH(sin); + return sin(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sin(const float &x) { return ::sinf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sin(const double &x) { return ::sin(x); } +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T tan(const T &x) { + EIGEN_USING_STD_MATH(tan); + return tan(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float tan(const float &x) { return ::tanf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double tan(const double &x) { return ::tan(x); } +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T tanh(const T &x) { + EIGEN_USING_STD_MATH(tanh); + return tanh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float tanh(const float &x) { return ::tanhf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double tanh(const double &x) { return ::tanh(x); } +#endif template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index ae2889ab3..f06290623 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -361,7 +361,7 @@ struct functor_traits > */ template struct scalar_cos_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op) - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); } + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); } }; @@ -380,7 +380,7 @@ struct functor_traits > */ template struct scalar_sin_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) 
const { return internal::psin(a); } }; @@ -400,7 +400,7 @@ struct functor_traits > */ template struct scalar_tan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); } }; @@ -614,7 +614,7 @@ struct functor_traits > */ template struct scalar_tanh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tanh; return tanh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tanh(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); } }; -- cgit v1.2.3 From a95e1a273e9e194c3bbd62634c42dc2b52911f21 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 22:00:38 +0200 Subject: Fix warning in unit tests --- test/vectorization_logic.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index f1a74d765..ee446c3c1 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -24,9 +24,9 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling) typedef internal::copy_using_evaluator_traits,internal::evaluator, internal::assign_op > traits; bool res = traits::Traversal==traversal; if(unrolling==InnerUnrolling+CompleteUnrolling) - res = res && (traits::Unrolling==InnerUnrolling || traits::Unrolling==CompleteUnrolling); + res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling); else - res = res && traits::Unrolling==unrolling; + res = res && int(traits::Unrolling)==unrolling; if(!res) { std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl; -- cgit v1.2.3 From f4e12272f15ecde2751d1be615527925b19dee0e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 22:18:02 +0200 Subject: Fix corner case in unit test. 
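The corner case: for a single-row matrix, m1.row(0) has exactly the same dimensions as m1, so swapping the two is size-consistent and the mismatched-size assertion can never fire; the test therefore only checks the assertion when rows() > 1. A minimal sketch of that observation (plain Eigen, no test framework assumed):

#include <Eigen/Dense>
#include <cassert>

int main() {
  Eigen::MatrixXd one_row(1, 5);
  // row(0) of a 1x5 matrix is itself 1x5, so there is no size mismatch
  // for swap() to assert on in this degenerate shape.
  assert(one_row.rows() == one_row.row(0).rows());
  assert(one_row.cols() == one_row.row(0).cols());
  return 0;
}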
--- test/swap.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/swap.cpp b/test/swap.cpp index 5d6f0e6af..f76e3624d 100644 --- a/test/swap.cpp +++ b/test/swap.cpp @@ -74,10 +74,13 @@ template void swap(const MatrixType& m) m1 = m1_copy; m3 = m3_copy; - // test assertion on mismatching size -- matrix case - VERIFY_RAISES_ASSERT(m1.swap(m1.row(0))); - // test assertion on mismatching size -- xpr case - VERIFY_RAISES_ASSERT(m1.row(0).swap(m1)); + if(m1.rows()>1) + { + // test assertion on mismatching size -- matrix case + VERIFY_RAISES_ASSERT(m1.swap(m1.row(0))); + // test assertion on mismatching size -- xpr case + VERIFY_RAISES_ASSERT(m1.row(0).swap(m1)); + } } void test_swap() -- cgit v1.2.3 From feef39e2d1bfd2a703ff9125b60e899802f0c3d9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 22:49:51 +0200 Subject: Fix underflow in JacoviSVD's complex to real preconditioner --- Eigen/src/SVD/JacobiSVD.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index bf5ff48c3..88bc0688e 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -368,9 +368,15 @@ struct svd_precondition_2x2_block_to_be_real if(n==0) { - z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); - work_matrix.row(p) *= z; - if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z); + // make sure firt column is zero (deflation) + work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0); + if(work_matrix.coeff(p,q)!=Scalar(0)) + { + // work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when computing n + z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); + work_matrix.row(p) *= z; + if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z); + } if(work_matrix.coeff(q,q)!=Scalar(0)) { z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q); -- cgit v1.2.3 From 7d1391d049891a63475164c4f984c073ddd3bd00 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 22:50:54 +0200 Subject: Turn a converge check to a warning --- unsupported/test/levenberg_marquardt.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp index 6dc17bd17..64f168c16 100644 --- a/unsupported/test/levenberg_marquardt.cpp +++ b/unsupported/test/levenberg_marquardt.cpp @@ -792,7 +792,9 @@ void testNistMGH10(void) MGH10_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + ++g_test_level; VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + --g_test_level; // was: VERIFY_IS_EQUAL(info, 1); // check norm^2 -- cgit v1.2.3 From 2c9e4fa4171fafccf51cd6980aafbe59209256af Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 22:56:12 +0200 Subject: Add debug output for random unit test --- test/rand.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/rand.cpp b/test/rand.cpp index 6790acf15..eeec34191 100644 --- a/test/rand.cpp +++ b/test/rand.cpp @@ -29,6 +29,9 @@ template void check_all_in_range(Scalar x, Scalar y) { mask( check_in_range(x,y)-x )++; } + for(Index i=0; i0).all() ); } -- cgit v1.2.3 From 87ca15c4e8810777fd884d7f661f2c178fccd441 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Apr 2016 14:12:38 -0700 Subject: Added support for sin, cos, tan, and tanh on fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff 
--git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index dd4cb10d9..4fbf8f412 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -424,13 +424,30 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::h template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) { return Eigen::half(::powf(float(a), float(b))); } +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sin(const Eigen::half& a) { + return Eigen::half(::sinf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half cos(const Eigen::half& a) { + return Eigen::half(::cosf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tan(const Eigen::half& a) { + return Eigen::half(::tanf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tanh(const Eigen::half& a) { + return Eigen::half(::tanhf(float(a))); +} template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { return Eigen::half(::floorf(float(a))); } template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { return Eigen::half(::ceilf(float(a))); } - +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) { + return Eigen::half(erff(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) { + return Eigen::half(erfcf(float(a))); +} } // end namespace numext } // end namespace Eigen -- cgit v1.2.3 From 29862532599ca4d0e02ab759813604fa1f2e2067 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Apr 2016 14:24:06 -0700 Subject: Cleaned up the implementation of digamma --- Eigen/src/Core/SpecialFunctions.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index adb055b15..a3857ae1f 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -281,20 +281,18 @@ struct digamma_impl { */ Scalar p, q, nz, s, w, y; - bool negative; + bool negative = false; const Scalar maxnum = NumTraits::infinity(); - const Scalar m_pi = EIGEN_PI; + const Scalar m_pi(EIGEN_PI); - negative = 0; - nz = 0.0; - - const Scalar zero = 0.0; - const Scalar one = 1.0; - const Scalar half = 0.5; + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + nz = zero; if (x <= zero) { - negative = one; + negative = true; q = x; p = numext::floor(q); if (p == q) { @@ -463,7 +461,7 @@ template struct igammac_impl { EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) { - /* igamc() + /* igamc() * * Incomplete gamma integral (modified for Eigen) * @@ -618,7 +616,7 @@ template struct igamma_impl { EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) { - /* igam() + /* igam() * Incomplete gamma integral * * -- cgit v1.2.3 From 39211ba46b7fa449f4f10265d9d6da8a3e4b7b43 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 13 Apr 2016 23:43:26 +0200 Subject: Fix JacobiSVD for complex when the complex-to-real update already gives a diagonal 2x2 block. 
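The preconditioner now reports whether the 2x2 block still needs a Jacobi rotation, by comparing its off-diagonal entries against a threshold built from the diagonal magnitudes. A scalar sketch of that test, real-valued for brevity; needs_2x2_jacobi is a hypothetical free function, and in the real code the entries are work-matrix coefficients and precision is derived from the scalar type's machine epsilon:

#include <algorithm>
#include <cmath>
#include <limits>

bool needs_2x2_jacobi(double a_pp, double a_pq, double a_qp, double a_qq,
                      double precision) {
  const double considerAsZero = 2.0 * std::numeric_limits<double>::denorm_min();
  const double threshold =
      std::max(considerAsZero,
               precision * std::max(std::abs(a_pp), std::abs(a_qq)));
  // More work is needed only if an off-diagonal entry still exceeds the threshold.
  return std::abs(a_pq) > threshold || std::abs(a_qp) > threshold;
}

int main() {
  bool diag    = needs_2x2_jacobi(1.0, 0.0, 0.0, 2.0, 1e-12);  // false: already diagonal
  bool offdiag = needs_2x2_jacobi(1.0, 0.5, 0.0, 2.0, 1e-12);  // true: rotation still needed
  return (!diag && offdiag) ? 0 : 1;
}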
--- Eigen/src/SVD/JacobiSVD.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 88bc0688e..f08776bc6 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -350,7 +350,7 @@ template struct svd_precondition_2x2_block_to_be_real { typedef JacobiSVD SVD; - static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {} + static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, const typename MatrixType::RealScalar&) { return true; } }; template @@ -359,7 +359,7 @@ struct svd_precondition_2x2_block_to_be_real typedef JacobiSVD SVD; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q) + static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, const typename MatrixType::RealScalar& precision) { using std::sqrt; Scalar z; @@ -368,7 +368,7 @@ struct svd_precondition_2x2_block_to_be_real if(n==0) { - // make sure firt column is zero (deflation) + // make sure first column is zero work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0); if(work_matrix.coeff(p,q)!=Scalar(0)) { @@ -404,6 +404,12 @@ struct svd_precondition_2x2_block_to_be_real if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z); } } + + const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits::denorm_min(); + RealScalar threshold = numext::maxi(considerAsZero, + precision * numext::maxi(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q)))); + // return true if we still have some work to do + return numext::abs(work_matrix(p,q)) > threshold || numext::abs(work_matrix(q,p)) > threshold; } }; @@ -735,18 +741,20 @@ JacobiSVD::compute(const MatrixType& matrix, unsig if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold) { finished = false; - // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal - internal::svd_precondition_2x2_block_to_be_real::run(m_workMatrix, *this, p, q); - JacobiRotation j_left, j_right; - internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right); - - // accumulate resulting Jacobi rotations - m_workMatrix.applyOnTheLeft(p,q,j_left); - if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose()); - - m_workMatrix.applyOnTheRight(p,q,j_right); - if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right); + // the complex to real operation returns true is the updated 2x2 block is not already diagonal + if(internal::svd_precondition_2x2_block_to_be_real::run(m_workMatrix, *this, p, q, precision)) + { + JacobiRotation j_left, j_right; + internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right); + + // accumulate resulting Jacobi rotations + m_workMatrix.applyOnTheLeft(p,q,j_left); + if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose()); + + m_workMatrix.applyOnTheRight(p,q,j_right); + if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right); + } } } } -- cgit v1.2.3 From d6105b53b865156198f471e8aa9a531cdb4e65da Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Apr 2016 15:26:02 -0700 Subject: Added basic implementation of the lgamma, digamma, igamma, igammac, polygamma, and zeta function for fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h 
b/Eigen/src/Core/arch/CUDA/Half.h index 4fbf8f412..dfab34d32 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -442,12 +442,31 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen:: template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { return Eigen::half(::ceilf(float(a))); } +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::lgamma(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::digamma(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) { + return Eigen::half(Eigen::numext::zeta(static_cast(x), static_cast(q))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) { + return Eigen::half(Eigen::numext::polygamma(static_cast(n), static_cast(x))); +} template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) { - return Eigen::half(erff(float(a))); + return Eigen::half(Eigen::numext::erf(static_cast(a))); } template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) { - return Eigen::half(erfcf(float(a))); + return Eigen::half(Eigen::numext::erfc(static_cast(a))); } +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igamma(static_cast(a), static_cast(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igammac(static_cast(a), static_cast(x))); +} + } // end namespace numext } // end namespace Eigen -- cgit v1.2.3 From 10b69810d17ae1e7a9278264f0aaafd966d4396f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Apr 2016 16:00:51 -0700 Subject: Improved support for trigonometric functions on GPU --- Eigen/src/Core/MathFunctions.h | 76 +++++++++++++++++++++++++++++++++ Eigen/src/Core/functors/UnaryFunctors.h | 10 ++--- 2 files changed, 81 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index b27d51ac1..5771abf7d 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1098,6 +1098,82 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tan(const double &x) { return ::tan(x); } #endif +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T acos(const T &x) { + EIGEN_USING_STD_MATH(acos); + return acos(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float acos(const float &x) { return ::acosf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double acos(const double &x) { return ::acos(x); } +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T asin(const T &x) { + EIGEN_USING_STD_MATH(asin); + return asin(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float asin(const float &x) { return ::asinf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double asin(const double &x) { return ::asin(x); } +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T atan(const T &x) { + EIGEN_USING_STD_MATH(atan); + return atan(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float atan(const float &x) { return 
::atanf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double atan(const double &x) { return ::atan(x); } +#endif + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T cosh(const T &x) { + EIGEN_USING_STD_MATH(cosh); + return cosh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float cosh(const float &x) { return ::coshf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double cosh(const double &x) { return ::cosh(x); } +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sinh(const T &x) { + EIGEN_USING_STD_MATH(sinh); + return sinh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sinh(const float &x) { return ::sinhf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sinh(const double &x) { return ::sinh(x); } +#endif + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T &x) { diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index f06290623..5baba1494 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -419,7 +419,7 @@ struct functor_traits > */ template struct scalar_acos_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); } }; @@ -438,7 +438,7 @@ struct functor_traits > */ template struct scalar_asin_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); } }; @@ -594,7 +594,7 @@ struct functor_traits > */ template struct scalar_atan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); } }; @@ -649,7 +649,7 @@ struct functor_traits > */ template struct scalar_sinh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sinh; return sinh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); } }; @@ -668,7 +668,7 @@ struct functor_traits > */ template struct scalar_cosh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::cosh; return cosh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); } }; -- cgit v1.2.3 From 36f5a10198a8bc745633121fc4e8794a00b559e7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Apr 2016 18:44:48 -0700 Subject: Properly gate the definition of the error and gamma 
functions for fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index dfab34d32..482b654ac 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -442,6 +442,8 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen:: template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { return Eigen::half(::ceilf(float(a))); } + +#ifdef EIGEN_HAS_C99_MATH template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { return Eigen::half(Eigen::numext::lgamma(static_cast(a))); } @@ -466,7 +468,7 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen: template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) { return Eigen::half(Eigen::numext::igammac(static_cast(a), static_cast(x))); } - +#endif } // end namespace numext } // end namespace Eigen -- cgit v1.2.3 From d402adc3d7f32115b51f46675ea39d3fa8868843 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 14 Apr 2016 15:18:08 +0200 Subject: Better use .data() than &coeffRef(0) --- Eigen/src/SuperLUSupport/SuperLUSupport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index 0ae3017cc..7e2efd452 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -986,7 +986,7 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase Date: Thu, 14 Apr 2016 15:18:56 +0200 Subject: remove useless include --- bench/BenchTimer.h | 1 - 1 file changed, 1 deletion(-) diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index 64666d75f..ea28496b7 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -22,7 +22,6 @@ # endif # include #elif defined(__APPLE__) -#include #include #else # include -- cgit v1.2.3 From 3551dea887ce60756c28796e83bb7c080f2b2782 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 14 Apr 2016 16:45:41 +0200 Subject: Cleaning pass on rcond estimator. --- Eigen/src/Cholesky/LDLT.h | 39 ++++----- Eigen/src/Cholesky/LLT.h | 41 ++++----- Eigen/src/Core/ConditionEstimator.h | 161 ++++++++++++++---------------------- Eigen/src/LU/FullPivLU.h | 4 +- Eigen/src/LU/PartialPivLU.h | 4 +- 5 files changed, 99 insertions(+), 150 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 90ed32fac..538aff956 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -193,12 +193,12 @@ template class LDLT LDLT& compute(const EigenBase& matrix); /** \returns an estimate of the reciprocal condition number of the matrix of - * which *this is the LDLT decomposition. + * which \c *this is the LDLT decomposition. */ RealScalar rcond() const { eigen_assert(m_isInitialized && "LDLT is not initialized."); - return ReciprocalConditionNumberEstimate(m_l1_norm, *this); + return internal::rcond_estimate_helper(m_l1_norm, *this); } template @@ -216,10 +216,12 @@ template class LDLT MatrixType reconstructedMatrix() const; - /** \returns the decomposition itself to allow generic code to do - * ldlt.adjoint().solve(rhs). - */ - const LDLT& adjoint() const { return *this; }; + /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint. 
+ * + * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: + * \code x = decomposition.adjoint().solve(b) \endcode + */ + const LDLT& adjoint() const { return *this; }; inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } @@ -456,22 +458,15 @@ LDLT& LDLT::compute(const EigenBase() + - m_matrix.row(col).head(col).template lpNorm<1>(); - if (abs_col_sum > m_l1_norm) { - m_l1_norm = abs_col_sum; - } - } - } else { - for (int col = 0; col < a.cols(); ++col) { - const RealScalar abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + - m_matrix.row(col).tail(size - col).template lpNorm<1>(); - if (abs_col_sum > m_l1_norm) { - m_l1_norm = abs_col_sum; - } - } + // TODO move this code to SelfAdjointView + for (Index col = 0; col < size; ++col) { + RealScalar abs_col_sum; + if (_UpLo == Lower) + abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>(); + else + abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>(); + if (abs_col_sum > m_l1_norm) + m_l1_norm = abs_col_sum; } m_transpositions.resize(size); diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index f88afe8b5..19578b216 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -136,13 +136,13 @@ template class LLT LLT& compute(const EigenBase& matrix); /** \returns an estimate of the reciprocal condition number of the matrix of - * which *this is the Cholesky decomposition. - */ + * which \c *this is the Cholesky decomposition. + */ RealScalar rcond() const { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative"); - return ReciprocalConditionNumberEstimate(m_l1_norm, *this); + return internal::rcond_estimate_helper(m_l1_norm, *this); } /** \returns the LLT decomposition matrix @@ -169,10 +169,12 @@ template class LLT return m_info; } - /** \returns the decomposition itself to allow generic code to do - * llt.adjoint().solve(rhs). - */ - const LLT& adjoint() const { return *this; }; + /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint. + * + * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: + * \code x = decomposition.adjoint().solve(b) \endcode + */ + const LLT& adjoint() const { return *this; }; inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } @@ -411,22 +413,15 @@ LLT& LLT::compute(const EigenBase // Compute matrix L1 norm = max abs column sum. 
m_l1_norm = RealScalar(0); - if (_UpLo == Lower) { - for (int col = 0; col < size; ++col) { - const RealScalar abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + - m_matrix.row(col).head(col).template lpNorm<1>(); - if (abs_col_sum > m_l1_norm) { - m_l1_norm = abs_col_sum; - } - } - } else { - for (int col = 0; col < a.cols(); ++col) { - const RealScalar abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + - m_matrix.row(col).tail(size - col).template lpNorm<1>(); - if (abs_col_sum > m_l1_norm) { - m_l1_norm = abs_col_sum; - } - } + // TODO move this code to SelfAdjointView + for (Index col = 0; col < size; ++col) { + RealScalar abs_col_sum; + if (_UpLo == Lower) + abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>(); + else + abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>(); + if (abs_col_sum > m_l1_norm) + m_l1_norm = abs_col_sum; } m_isInitialized = true; diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h index f53c2a837..68c5e918e 100644 --- a/Eigen/src/Core/ConditionEstimator.h +++ b/Eigen/src/Core/ConditionEstimator.h @@ -13,139 +13,97 @@ namespace Eigen { namespace internal { -template -inline typename MatrixType::RealScalar MatrixL1Norm(const MatrixType& matrix) { - return matrix.cwiseAbs().colwise().sum().maxCoeff(); -} - -template -inline typename Vector::RealScalar VectorL1Norm(const Vector& v) { - return v.template lpNorm<1>(); -} template -struct SignOrUnity { +struct rcond_compute_sign { static inline Vector run(const Vector& v) { const RealVector v_abs = v.cwiseAbs(); return (v_abs.array() == static_cast(0)) - .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs)); + .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs)); } }; // Partial specialization to avoid elementwise division for real vectors. template -struct SignOrUnity { +struct rcond_compute_sign { static inline Vector run(const Vector& v) { return (v.array() < static_cast(0)) - .select(-Vector::Ones(v.size()), Vector::Ones(v.size())); + .select(-Vector::Ones(v.size()), Vector::Ones(v.size())); } }; -} // namespace internal - -/** \class ConditionEstimator - * \ingroup Core_Module - * - * \brief Condition number estimator. - * - * Computing a decomposition of a dense matrix takes O(n^3) operations, while - * this method estimates the condition number quickly and reliably in O(n^2) - * operations. - * - * \returns an estimate of the reciprocal condition number - * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given the matrix and - * its decomposition. Supports the following decompositions: FullPivLU, - * PartialPivLU, LDLT, and LLT. - * - * \sa FullPivLU, PartialPivLU, LDLT, LLT. - */ +/** \brief Reciprocal condition number estimator. + * + * Computing a decomposition of a dense matrix takes O(n^3) operations, while + * this method estimates the condition number quickly and reliably in O(n^2) + * operations. + * + * \returns an estimate of the reciprocal condition number + * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and + * its decomposition. Supports the following decompositions: FullPivLU, + * PartialPivLU, LDLT, and LLT. + * + * \sa FullPivLU, PartialPivLU, LDLT, LLT. 
+ */ template -typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( - const typename Decomposition::MatrixType& matrix, - const Decomposition& dec) { - eigen_assert(matrix.rows() == dec.rows()); - eigen_assert(matrix.cols() == dec.cols()); - if (dec.rows() == 0) return typename Decomposition::RealScalar(1); - return ReciprocalConditionNumberEstimate(MatrixL1Norm(matrix), dec); -} - -/** \class ConditionEstimator - * \ingroup Core_Module - * - * \brief Condition number estimator. - * - * Computing a decomposition of a dense matrix takes O(n^3) operations, while - * this method estimates the condition number quickly and reliably in O(n^2) - * operations. - * - * \returns an estimate of the reciprocal condition number - * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and - * its decomposition. Supports the following decompositions: FullPivLU, - * PartialPivLU, LDLT, and LLT. - * - * \sa FullPivLU, PartialPivLU, LDLT, LLT. - */ -template -typename Decomposition::RealScalar ReciprocalConditionNumberEstimate( - typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) { +typename Decomposition::RealScalar +rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) +{ typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) return RealScalar(1); + if (dec.rows() == 0) return RealScalar(1); if (matrix_norm == RealScalar(0)) return RealScalar(0); - if (dec.rows() == 1) return RealScalar(1); - const typename Decomposition::RealScalar inverse_matrix_norm = - InverseMatrixL1NormEstimate(dec); - return (inverse_matrix_norm == RealScalar(0) - ? RealScalar(0) - : (RealScalar(1) / inverse_matrix_norm) / matrix_norm); + if (dec.rows() == 1) return RealScalar(1); + const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); + return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0) + : (RealScalar(1) / inverse_matrix_norm) / matrix_norm); } /** - * \returns an estimate of ||inv(matrix)||_1 given a decomposition of - * matrix that implements .solve() and .adjoint().solve() methods. - * - * The method implements Algorithms 4.1 and 5.1 from - * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf - * which also forms the basis for the condition number estimators in - * LAPACK. Since at most 10 calls to the solve method of dec are - * performed, the total cost is O(dims^2), as opposed to O(dims^3) - * needed to compute the inverse matrix explicitly. - * - * The most common usage is in estimating the condition number - * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be - * computed directly in O(n^2) operations. - * - * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and - * LLT. - * - * \sa FullPivLU, PartialPivLU, LDLT, LLT. - */ + * \returns an estimate of ||inv(matrix)||_1 given a decomposition of + * \a matrix that implements .solve() and .adjoint().solve() methods. + * + * This function implements Algorithms 4.1 and 5.1 from + * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf + * which also forms the basis for the condition number estimators in + * LAPACK. Since at most 10 calls to the solve method of dec are + * performed, the total cost is O(dims^2), as opposed to O(dims^3) + * needed to compute the inverse matrix explicitly. + * + * The most common usage is in estimating the condition number + * ||matrix||_1 * ||inv(matrix)||_1. 
The first term ||matrix||_1 can be + * computed directly in O(n^2) operations. + * + * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and + * LLT. + * + * \sa FullPivLU, PartialPivLU, LDLT, LLT. + */ template -typename Decomposition::RealScalar InverseMatrixL1NormEstimate( - const Decomposition& dec) { +typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec) +{ typedef typename Decomposition::MatrixType MatrixType; typedef typename Decomposition::Scalar Scalar; typedef typename Decomposition::RealScalar RealScalar; typedef typename internal::plain_col_type::type Vector; - typedef typename internal::plain_col_type::type - RealVector; + typedef typename internal::plain_col_type::type RealVector; const bool is_complex = (NumTraits::IsComplex != 0); eigen_assert(dec.rows() == dec.cols()); const Index n = dec.rows(); - if (n == 0) { + if (n == 0) return 0; - } + Vector v = dec.solve(Vector::Ones(n) / Scalar(n)); // lower_bound is a lower bound on // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 // and is the objective maximized by the ("super-") gradient ascent // algorithm below. - RealScalar lower_bound = internal::VectorL1Norm(v); - if (n == 1) { + RealScalar lower_bound = v.template lpNorm<1>(); + if (n == 1) return lower_bound; - } + // Gradient ascent algorithm follows: We know that the optimum is achieved at // one of the simplices v = e_i, so in each iteration we follow a // super-gradient to move towards the optimal one. @@ -154,8 +112,9 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( Vector old_sign_vector; Index v_max_abs_index = -1; Index old_v_max_abs_index = v_max_abs_index; - for (int k = 0; k < 4; ++k) { - sign_vector = internal::SignOrUnity::run(v); + for (int k = 0; k < 4; ++k) + { + sign_vector = internal::rcond_compute_sign::run(v); if (k > 0 && !is_complex && sign_vector == old_sign_vector) { // Break if the solution stagnated. break; @@ -169,7 +128,7 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( } // Move to the new simplex e_j, where j = v_max_abs_index. v = dec.solve(Vector::Unit(n, v_max_abs_index)); // v = inv(matrix) * e_j. - lower_bound = internal::VectorL1Norm(v); + lower_bound = v.template lpNorm<1>(); if (lower_bound <= old_lower_bound) { // Break if the gradient step did not increase the lower_bound. break; @@ -192,16 +151,16 @@ typename Decomposition::RealScalar InverseMatrixL1NormEstimate( // Hager's algorithm to vastly underestimate ||matrix||_1. 
Scalar alternating_sign(RealScalar(1)); for (Index i = 0; i < n; ++i) { - v[i] = alternating_sign * - (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1)))); + v[i] = alternating_sign * (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1)))); alternating_sign = -alternating_sign; } v = dec.solve(v); - const RealScalar alternate_lower_bound = - (2 * internal::VectorL1Norm(v)) / (3 * RealScalar(n)); + const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n)); return numext::maxi(lower_bound, alternate_lower_bound); } +} // namespace internal + } // namespace Eigen #endif diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 978a54eff..64b9eb7f1 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -231,13 +231,13 @@ template class FullPivLU return Solve(*this, b.derived()); } - /** \returns an estimate of the reciprocal condition number of the matrix of which *this is + /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. */ inline RealScalar rcond() const { eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return ReciprocalConditionNumberEstimate(m_l1_norm, *this); + return internal::rcond_estimate_helper(m_l1_norm, *this); } /** \returns the determinant of the matrix of which diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index b22dd75fe..2e6d91939 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -151,13 +151,13 @@ template class PartialPivLU return Solve(*this, b.derived()); } - /** \returns an estimate of the reciprocal condition number of the matrix of which *this is + /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. */ inline RealScalar rcond() const { eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return ReciprocalConditionNumberEstimate(m_l1_norm, *this); + return internal::rcond_estimate_helper(m_l1_norm, *this); } /** \returns the inverse of the matrix of which *this is the LU decomposition. -- cgit v1.2.3 From f6003f08737eee960a70541a750e1675a470cdcf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 09:47:26 -0700 Subject: Made the test msvc friendly --- unsupported/test/cxx11_float16.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp index 2dc0872d8..db5f2130b 100644 --- a/unsupported/test/cxx11_float16.cpp +++ b/unsupported/test/cxx11_float16.cpp @@ -122,6 +122,8 @@ void test_comparison() VERIFY(half(1.0f) != half(2.0f)); // Comparisons with NaNs and infinities. 
+#if !EIGEN_COMP_MSVC + // Visual Studio errors out on divisions by 0 VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); @@ -132,6 +134,7 @@ void test_comparison() VERIFY(half(1.0) < half(1.0 / 0.0)); VERIFY(half(1.0) > half(-1.0 / 0.0)); +#endif } void test_functions() -- cgit v1.2.3 From c7167fee0eaf188fbe7ce7b877971b928f798a7a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 10:08:33 -0700 Subject: Added support for fp16 to the sigmoid function --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index b7c13f67f..4ef48c64c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -64,7 +64,7 @@ struct scalar_sigmoid_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { const T one = T(1); - return one / (one + std::exp(-x)); + return one / (one + numext::exp(-x)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -799,7 +799,7 @@ class GaussianGenerator { T offset = coordinates[i] - m_means[i]; tmp += offset * offset / m_two_sigmas[i]; } - return std::exp(-tmp); + return numext::exp(-tmp); } private: -- cgit v1.2.3 From 5c13765ee333bb78b5c7baeb515eed97c59b6c1d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 10:24:52 -0700 Subject: Added ability to printf fp16 --- Eigen/src/Core/arch/CUDA/Half.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 482b654ac..8249ce2eb 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -510,6 +510,11 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a namespace std { +ostream& operator << (ostream& os, const Eigen::half& v) { + os << static_cast(v); + return os; +} + #if __cplusplus > 199711L template <> struct hash { -- cgit v1.2.3 From 7b3d7acebeadb443d8e3ac9756359d507324cc82 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 10:25:50 -0700 Subject: Added support for fp16 to test_isApprox, test_isMuchSmallerThan, and test_isApproxOrLessThan --- test/main.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/main.h b/test/main.h index bba5e7570..dbb496b89 100644 --- a/test/main.h +++ b/test/main.h @@ -316,9 +316,9 @@ inline bool test_isMuchSmallerThan(const float& a, const float& b) { return internal::isMuchSmallerThan(a, b, test_precision()); } inline bool test_isApproxOrLessThan(const float& a, const float& b) { return internal::isApproxOrLessThan(a, b, test_precision()); } + inline bool test_isApprox(const double& a, const double& b) { return internal::isApprox(a, b, test_precision()); } - inline bool test_isMuchSmallerThan(const double& a, const double& b) { return internal::isMuchSmallerThan(a, b, test_precision()); } inline bool test_isApproxOrLessThan(const double& a, const double& b) @@ -359,6 +359,12 @@ inline bool test_isApproxOrLessThan(const long double& a, const long double& b) { return internal::isApproxOrLessThan(a, b, test_precision()); } #endif // EIGEN_TEST_NO_LONGDOUBLE +inline bool test_isApprox(const half& a, const half& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isMuchSmallerThan(const half& a, const half& b) +{ return 
internal::isMuchSmallerThan(a, b, test_precision()); } +inline bool test_isApproxOrLessThan(const half& a, const half& b) +{ return internal::isApproxOrLessThan(a, b, test_precision()); } // test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox. template @@ -426,9 +432,7 @@ template typename NumTraits::Real test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if::Real>::value, T1>::type* = 0) { typedef typename NumTraits::Real RealScalar; - using std::min; - using std::sqrt; - return sqrt(RealScalar(numext::abs2(a-b))/RealScalar((min)(numext::abs2(a),numext::abs2(b)))); + return numext::sqrt(RealScalar(numext::abs2(a-b))/RealScalar((numext::mini)(numext::abs2(a),numext::abs2(b)))); } template -- cgit v1.2.3 From 72510c80e1a7406af915f0851e6bfbe605d3f436 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 10:27:24 -0700 Subject: Added basic test for trigonometric functions on fp16 --- unsupported/test/cxx11_float16.cpp | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp index db5f2130b..2d9376d29 100644 --- a/unsupported/test/cxx11_float16.cpp +++ b/unsupported/test/cxx11_float16.cpp @@ -137,7 +137,7 @@ void test_comparison() #endif } -void test_functions() +void test_basic_functions() { VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); @@ -149,10 +149,32 @@ void test_functions() VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); } +void test_trigonometric_functions() +{ + VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); + VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); + //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); + //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); + + VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); + // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); + VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f))); + + VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f))); + // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI))); + // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2))); + //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f))); +} + void test_cxx11_float16() { CALL_SUBTEST(test_conversion()); CALL_SUBTEST(test_arithmetic()); CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_functions()); + CALL_SUBTEST(test_basic_functions()); + CALL_SUBTEST(test_trigonometric_functions()); } -- cgit v1.2.3 From 6f23e945f6fd75d8d7b48d83f01976f91da0c24f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 10:32:52 -0700 Subject: Added simple test for numext::sqrt and numext::pow on fp16 --- unsupported/test/cxx11_float16.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp index 2d9376d29..b437868d7 100644 --- a/unsupported/test/cxx11_float16.cpp +++ b/unsupported/test/cxx11_float16.cpp @@ -142,6 +142,12 @@ void test_basic_functions() 
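The trigonometric checks commented out above (arguments near multiples of pi) fail because of half's limited precision rather than a bug in the math functions. A rough standalone sketch of the effect, assuming Eigen::half is reachable through <Eigen/Core> in this build:

#include <Eigen/Core>  // assumption: exposes Eigen::half here
#include <cmath>
#include <iostream>

int main() {
  // half keeps an 11-bit significand, so pi/2 is rounded by up to ~2^-11 relative error.
  const float pi_2 = static_cast<float>(EIGEN_PI) / 2;
  const float rounded = static_cast<float>(Eigen::half(pi_2));
  // cos() at the rounded argument is on the order of 1e-4 instead of ~0,
  // a large relative error, which is what VERIFY_IS_APPROX measures.
  std::cout << "rounding error: " << rounded - pi_2 << "\n"
            << "cos at rounded pi/2: " << std::cos(rounded) << "\n";
  return 0;
}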
VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); + VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f); + VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f); + + VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f); + VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f); + VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); -- cgit v1.2.3 From 2b6e3de02f6f141c6bab523c54cda432d796eec7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 11:39:18 -0700 Subject: Added tests to validate flooring and ceiling of fp16 --- unsupported/test/cxx11_float16.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp index b437868d7..273dcbc11 100644 --- a/unsupported/test/cxx11_float16.cpp +++ b/unsupported/test/cxx11_float16.cpp @@ -142,6 +142,12 @@ void test_basic_functions() VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); + VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f); + VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f); + + VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f); + VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f); + VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f); VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f); -- cgit v1.2.3 From 5912ad877c6fe0072c56e8d2f70b315a1f4da6ce Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 11:40:14 -0700 Subject: Silenced a compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 3e56589c3..5950f38e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -53,9 +53,7 @@ struct TensorUInt128 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { - typedef typename conditional::type UnsignedT; - typedef typename conditional::type UnsignedLow; - eigen_assert(static_cast(x) <= static_cast(NumTraits::highest())); + eigen_assert((static_cast::type>(x) <= static_cast::type>(NumTraits::highest()))); eigen_assert(x >= 0); } -- cgit v1.2.3 From 5379d2b5944f2c26ff0ddce65fc6f99f5182f7b7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 11:40:48 -0700 Subject: Inline the << operator on half floats --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 8249ce2eb..bdf97dcd6 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -510,7 +510,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a namespace std { -ostream& operator << (ostream& os, const Eigen::half& v) { +EIGEN_STRONG_INLINE ostream& operator << (ostream& os, const Eigen::half& v) { os << static_cast(v); return os; } -- cgit v1.2.3 From 7718749fee835095f0671fa6ce5d257609f8e56b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 11:51:54 -0700 Subject: Force the inlining of the << operator on half floats --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index bdf97dcd6..9ecc4fd88 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -510,7 +510,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a namespace std { -EIGEN_STRONG_INLINE ostream& operator << (ostream& os, const Eigen::half& v) { +EIGEN_ALWAYS_INLINE ostream& operator << (ostream& os, const Eigen::half& v) { os << static_cast(v); return os; } -- cgit v1.2.3 From 20f387fafa5dbab90c240612e33e5c13d215ac5f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 14 Apr 2016 22:46:55 +0200 Subject: Improve numerical robustness of JacoviSVD: - avoid noise amplification in complex to real conversion - compare off-diagonal entries to the current biggest diagonal entry: no need to bother about a 2x2 block containing ridiculously small entries compared to the rest of the matrix. --- Eigen/src/SVD/JacobiSVD.h | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index f08776bc6..1940c8294 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -350,7 +350,8 @@ template struct svd_precondition_2x2_block_to_be_real { typedef JacobiSVD SVD; - static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, const typename MatrixType::RealScalar&) { return true; } + typedef typename MatrixType::RealScalar RealScalar; + static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; } }; template @@ -359,25 +360,30 @@ struct svd_precondition_2x2_block_to_be_real typedef JacobiSVD SVD; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, const typename MatrixType::RealScalar& precision) + static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) { using std::sqrt; + using std::abs; Scalar z; JacobiRotation rot; RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p))); - + + const RealScalar considerAsZero = (std::numeric_limits::min)(); + const RealScalar precision = NumTraits::epsilon(); + if(n==0) { // make sure first column is zero work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0); - if(work_matrix.coeff(p,q)!=Scalar(0)) + + if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero) { // work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when computing n z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); work_matrix.row(p) *= z; if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z); } - if(work_matrix.coeff(q,q)!=Scalar(0)) + if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero) { z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q); work_matrix.row(q) *= z; @@ -391,13 +397,13 @@ struct svd_precondition_2x2_block_to_be_real rot.s() = work_matrix.coeff(q,p) / n; work_matrix.applyOnTheLeft(p,q,rot); if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint()); - if(work_matrix.coeff(p,q) != Scalar(0)) + if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero) { z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); work_matrix.col(q) *= z; if(svd.computeV()) svd.m_matrixV.col(q) *= z; } - if(work_matrix.coeff(q,q) != Scalar(0)) + 
if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero) { z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q); work_matrix.row(q) *= z; @@ -405,11 +411,11 @@ struct svd_precondition_2x2_block_to_be_real } } - const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits::denorm_min(); - RealScalar threshold = numext::maxi(considerAsZero, - precision * numext::maxi(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q)))); - // return true if we still have some work to do - return numext::abs(work_matrix(p,q)) > threshold || numext::abs(work_matrix(q,p)) > threshold; + // update largest diagonal entry + maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q)))); + // and check whether the 2x2 block is already diagonal + RealScalar threshold = numext::maxi(considerAsZero, precision * maxDiagEntry); + return abs(work_matrix.coeff(p,q))>threshold || abs(work_matrix.coeff(q,p)) > threshold; } }; @@ -426,7 +432,6 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, JacobiRotation rot1; RealScalar t = m.coeff(0,0) + m.coeff(1,1); RealScalar d = m.coeff(1,0) - m.coeff(0,1); - if(d == RealScalar(0)) { rot1.s() = RealScalar(0); @@ -719,6 +724,7 @@ JacobiSVD::compute(const MatrixType& matrix, unsig } /*** step 2. The main Jacobi SVD iteration. ***/ + RealScalar maxDiagEntry = m_workMatrix.cwiseAbs().diagonal().maxCoeff(); bool finished = false; while(!finished) @@ -734,16 +740,13 @@ JacobiSVD::compute(const MatrixType& matrix, unsig // if this 2x2 sub-matrix is not diagonal already... // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't // keep us iterating forever. Similarly, small denormal numbers are considered zero. - RealScalar threshold = numext::maxi(considerAsZero, - precision * numext::maxi(abs(m_workMatrix.coeff(p,p)), - abs(m_workMatrix.coeff(q,q)))); - // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791) + RealScalar threshold = numext::maxi(considerAsZero, precision * maxDiagEntry); if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold) { finished = false; // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal // the complex to real operation returns true is the updated 2x2 block is not already diagonal - if(internal::svd_precondition_2x2_block_to_be_real::run(m_workMatrix, *this, p, q, precision)) + if(internal::svd_precondition_2x2_block_to_be_real::run(m_workMatrix, *this, p, q, maxDiagEntry)) { JacobiRotation j_left, j_right; internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right); @@ -754,6 +757,9 @@ JacobiSVD::compute(const MatrixType& matrix, unsig m_workMatrix.applyOnTheRight(p,q,j_right); if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right); + + // keep track of the largest diagonal coefficient + maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q)))); } } } -- cgit v1.2.3 From 68897c52f3c8cd37824f87cd9582cf98a5c9eb32 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 14 Apr 2016 22:47:30 +0200 Subject: Add extreme values to the imaginary part for SVD unit tests. 
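The new stopping rule above can be read in isolation: an off-diagonal pair (p,q) is treated as zero once it is tiny relative to the largest diagonal entry seen so far, rather than relative to its own 2x2 block. A minimal sketch of that criterion with plain doubles (not the Eigen internals):

#include <algorithm>
#include <cmath>
#include <limits>

// True when both off-diagonal entries of the 2x2 block are negligible
// compared to the running maximum of the diagonal.
bool offDiagonalIsNegligible(double apq, double aqp, double maxDiagEntry) {
  const double considerAsZero = (std::numeric_limits<double>::min)();
  const double precision = std::numeric_limits<double>::epsilon();
  const double threshold = std::max(considerAsZero, precision * maxDiagEntry);
  return !(std::abs(apq) > threshold || std::abs(aqp) > threshold);
}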
--- test/svd_fill.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test/svd_fill.h b/test/svd_fill.h index 7e44b3d05..1bbe645ee 100644 --- a/test/svd_fill.h +++ b/test/svd_fill.h @@ -80,6 +80,8 @@ void svd_fill_random(MatrixType &m, int Option = 0) Index i = internal::random(0,m.rows()-1); Index j = internal::random(0,m.cols()-1); m(j,i) = m(i,j) = samples(internal::random(0,samples.size()-1)); + if(NumTraits::IsComplex) + *(&numext::real_ref(m(j,i))+1) = *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random(0,samples.size()-1)); } } } @@ -91,8 +93,14 @@ void svd_fill_random(MatrixType &m, int Option = 0) if(!(dup && unit_uv)) { Index n = internal::random(0,m.size()-1); - for(Index i=0; i(0,m.rows()-1), internal::random(0,m.cols()-1)) = samples(internal::random(0,samples.size()-1)); + for(Index k=0; k(0,m.rows()-1); + Index j = internal::random(0,m.cols()-1); + m(i,j) = samples(internal::random(0,samples.size()-1)); + if(NumTraits::IsComplex) + *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random(0,samples.size()-1)); + } } } } -- cgit v1.2.3 From 235e83aba608cf3d94b033bfbf551f8c136a3fab Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Apr 2016 13:57:35 -0700 Subject: Eigen cost model part 1. This implements a basic recursive framework to estimate the cost of evaluating tensor expressions. --- unsupported/Eigen/CXX11/Tensor | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 5 + unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 25 ++- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 51 ++++-- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 63 ++++--- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 15 ++ .../Eigen/CXX11/src/Tensor/TensorContraction.h | 4 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 17 +- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 46 +++++- .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 181 +++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 25 ++- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 36 ++++ unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 13 ++ .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 8 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 31 ++-- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 31 ++-- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 4 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 9 + unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 62 +++++-- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 30 ++-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 33 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 40 +++-- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 36 ++-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 47 +++--- .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 27 +-- 28 files changed, 695 insertions(+), 185 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 16132398d..a4eae4201 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -26,6 +26,7 @@ * \endcode */ +#include #include #include @@ -84,6 +85,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorBase.h" +#include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include 
"src/Tensor/TensorReduction.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index f1ec04c49..babafe108 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -112,6 +112,11 @@ struct TensorEvaluator, Device> return CoeffReturnType(index, m_impl.coeff(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 199d2ce41..5abff0800 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -89,6 +89,12 @@ template struct TensorEvaluator, Device> { typedef TensorAssignOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, @@ -104,12 +110,6 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // The dimensions of the lhs and the rhs tensors should be equal to prevent @@ -150,6 +150,19 @@ struct TensorEvaluator, Device> return m_leftImpl.template packet(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here, but reduce left + // cost by one load because we are using m_leftImpl.coeffRef. 
+ TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); + return m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost( + numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), + left.bytes_stored(), left.compute_cycles()) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6e6db12a..c771496e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -101,6 +101,9 @@ struct TensorEvaluator, Device> typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -140,9 +143,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -247,9 +247,8 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -284,12 +283,12 @@ struct TensorEvaluator, Device> // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. - if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { + if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { return m_impl.template packet(inputIndex); } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < packetSize; ++i) { + for (int i = 1; i < PacketSize; ++i) { values[i] = coeffColMajor(originalIndex+i); } PacketReturnType rslt = internal::pload(values); @@ -300,9 +299,8 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -337,12 +335,12 @@ struct TensorEvaluator, Device> // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. 
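Costs propagate recursively through the evaluator tree: each evaluator adds its own TensorOpCost to whatever its children report, exactly as the assignment evaluator does above. A rough sketch of the arithmetic using the TensorOpCost class introduced by this patch, assuming it is reachable through the unsupported CXX11 Tensor header:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  using Eigen::TensorOpCost;
  // Leaf tensors: one float load per coefficient, nothing stored or computed.
  TensorOpCost a(sizeof(float), 0, 0);
  TensorOpCost b(sizeof(float), 0, 0);
  // a + b: both operands' costs plus one scalar addition.
  TensorOpCost sum = a + b + TensorOpCost(0, 0, TensorOpCost::AddCost<float>());
  // Assigning the result charges one float store per coefficient on top.
  TensorOpCost assign = sum + TensorOpCost(0, sizeof(float), 0);
  std::cout << assign << "\n";  // prints bytes loaded/stored and compute cycles
  return 0;
}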
- if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) { + if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { return m_impl.template packet(inputIndex); } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < packetSize; ++i) { + for (int i = 1; i < PacketSize; ++i) { values[i] = coeffRowMajor(originalIndex+i); } PacketReturnType rslt = internal::pload(values); @@ -350,6 +348,29 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost(); + if (NumDims > 0) { + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost(); + if (internal::index_statically_eq()(i, 1)) { + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else { + if (!internal::index_statically_eq()(i, 1)) { + compute_cost += TensorOpCost::MulCost() + + TensorOpCost::ModCost() + + TensorOpCost::AddCost(); + } + } + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c21a98fe0..2742dbb95 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -134,6 +134,10 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -180,9 +184,6 @@ struct TensorEvaluator, Device> m_inputOffset = m_stride * op.offset(); } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -202,17 +203,16 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; } @@ -226,13 +226,13 @@ struct TensorEvaluator, Device> } else { const Index idx = index / m_stride; const Index rem = index - idx * m_stride; - if (rem + packetSize <= m_stride) { + if (rem + PacketSize <= m_stride) { Index inputIndex = idx * m_inputStride + m_inputOffset + rem; return m_impl.template packet(inputIndex); } else { // Cross the stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; } @@ -242,6 +242,28 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost(); + } else { + cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + + 3 * TensorOpCost::AddCost(); + } + + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { CoeffReturnType* result = const_cast(m_impl.data()); if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || @@ -298,6 +320,9 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -309,9 +334,6 @@ struct TensorEvaluator, Device> : Base(op, device) { } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(this->srcCoeff(index)); @@ -320,17 +342,16 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(this->m_stride == 1); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } @@ -342,14 +363,14 @@ struct TensorEvaluator, Device> } else { const Index idx = index / this->m_stride; const Index rem = index - idx * this->m_stride; - if (rem + packetSize <= this->m_stride) { + if (rem + PacketSize <= this->m_stride) { const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; this->m_impl.template writePacket(inputIndex, x); } else { // Cross stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 7738f18fb..839c6e3e5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -260,6 +260,21 @@ struct TensorEvaluator() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost() + + TensorOpCost::ModCost()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * + m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index f070ba61e..1627d4870 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -440,6 +440,10 @@ struct TensorContractionEvaluatorBase return m_result[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_result + index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a96776a77..a2f1f71f5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -177,7 +177,6 @@ template struct ConversionSubExprEval struct TensorEvaluator, Device> @@ -190,6 +189,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_all::Scalar>::type SrcType; typedef typename PacketType::type PacketReturnType; typedef typename PacketType::type PacketSourceType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -231,6 +231,21 @@ struct TensorEvaluator, Device> return converter.template packet(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double 
cast_cost = TensorOpCost::CastCost(); + if (vectorized) { + const double SrcCoeffRatio = + internal::type_casting_traits::SrcCoeffRatio; + const double TgtCoeffRatio = + internal::type_casting_traits::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4fe1fb943..680fc0a68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -297,6 +297,11 @@ struct TensorEvaluator Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, @@ -367,10 +372,6 @@ struct TensorEvaluator::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -405,7 +406,6 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { - const int PacketSize = internal::unpacket_traits::size; Index indices[2] = {index, index+PacketSize-1}; Index startInputs[2] = {0, 0}; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -448,6 +448,23 @@ struct TensorEvaluator() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: @@ -1044,6 +1061,25 @@ struct TensorEvaluator(m_buf+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
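  // Rough reading of the estimate below: with plain tensors as operands,
  // a kernel of K coefficients charges each output coefficient about
  //   K * (one input load + one kernel load + 1 add + 1 mul),
  // plus a one-off index computation of roughly
  //   NumDims * (2 adds + 2 muls + 1 div),
  // mirroring the CPU-side estimate above.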
+ const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + private: // No assignment (copies are needed by the kernels) TensorEvaluator& operator = (const TensorEvaluator&); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h new file mode 100644 index 000000000..366352853 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -0,0 +1,181 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H +#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H + +#if !defined(EIGEN_USE_GPU) +#define EIGEN_USE_COST_MODEL +#endif + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief A cost model used to limit the number of threads used for evaluating + * tensor expression. + * + */ + +// Class storing the cost of evaluating a tensor expression in terms of the +// estimated number of operand bytes loads, bytes stored, and compute cycles. +class TensorOpCost { + public: + // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple + // model based on minimal reciprocal throughput numbers from Intel or + // Agner Fog's tables would be better than what is there now. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() { + return internal::functor_traits< + internal::scalar_product_op>::Cost; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() { + return internal::functor_traits>::Cost; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() { + return internal::functor_traits< + internal::scalar_quotient_op>::Cost; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() { + return internal::functor_traits>::Cost; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() { + return internal::functor_traits< + internal::scalar_cast_op>::Cost; + } + + TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(compute_cycles) {} + + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, + bool vectorized, double packet_size) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(vectorized ? 
compute_cycles / packet_size + : compute_cycles) { + using std::isfinite; + eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { + return bytes_loaded_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { + return bytes_stored_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { + return compute_cycles_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( + double load_cost, double store_cost, double compute_cost) const { + return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + + compute_cost * compute_cycles_; + } + + // TODO(rmlarsen): Define min in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin( + const TensorOpCost& rhs) { + bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored()); + compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles()); + return *this; + } + + // TODO(rmlarsen): Define max in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax( + const TensorOpCost& rhs) { + bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored()); + compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( + const TensorOpCost& rhs) { + bytes_loaded_ += rhs.bytes_loaded(); + bytes_stored_ += rhs.bytes_stored(); + compute_cycles_ += rhs.compute_cycles(); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { + bytes_loaded_ *= rhs; + bytes_stored_ *= rhs; + compute_cycles_ *= rhs; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( + TensorOpCost lhs, const TensorOpCost& rhs) { + lhs += rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + TensorOpCost lhs, double rhs) { + lhs *= rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + double lhs, TensorOpCost rhs) { + rhs *= lhs; + return rhs; + } + + friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { + return os << "[bytes_loaded = " << tc.bytes_loaded() + << ", bytes_stored = " << tc.bytes_stored() + << ", compute_cycles = " << tc.compute_cycles() << "]"; + } + + private: + double bytes_loaded_; + double bytes_stored_; + double compute_cycles_; +}; + +// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads +// in [1:max_threads] instead of just switching multi-threading off for small +// work units. +template +class TensorCostModel { + public: + // Costs in device cycles. + static const int kLoadCycles = 3; + static const int kStoreCycles = 3; + // Scaling from Eigen compute cost to device cycles. + static const int kDeviceCyclesPerComputeCycle = 1; + + // Implements a simple "binary" policy: Return 1 if total cost is below + // kMinWorkToParallelize and max_threads otherwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int numThreads( + double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { + // Compute total cost C in device cycles. 
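    // Worked example (illustrative): copying floats loads 4 bytes and stores
    // 4 bytes per coefficient, so with kLoadCycles = kStoreCycles = 3 the total
    // is about 3*4 + 3*4 = 24 device cycles per coefficient; the 1e6-cycle
    // cut-off below then keeps such an expression single-threaded until it
    // reaches roughly 40,000 output coefficients.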
+ const double total_cost = + output_size * + cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, + kDeviceCyclesPerComputeCycle); + // Smallest work unit to parallelize. + const double kMinParallelCost = 1e6; + return total_cost < kMinParallelCost ? 1 : max_threads; + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index b58e513b4..e020d076f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -83,8 +83,10 @@ struct TensorEvaluator, Devi typedef typename internal::traits::Index Index; static const int NumDims = internal::traits::NumDimensions; typedef DSizes Dimensions; - typedef - typename internal::remove_const::type Scalar; + typedef typename internal::remove_const::type Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -101,9 +103,6 @@ struct TensorEvaluator, Devi m_dimensions = op.func().dimensions(op.expression()); } - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { @@ -134,6 +133,11 @@ struct TensorEvaluator, Devi return internal::ploadt(m_result + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } protected: @@ -236,6 +240,9 @@ struct TensorEvaluator::NumDimensions; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -252,9 +259,6 @@ struct TensorEvaluator::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { @@ -284,6 +288,11 @@ struct TensorEvaluator(m_result + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. 
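    // The one-load placeholder below reflects that the custom op is evaluated
    // into a temporary buffer up front (see evalSubExprsIfNeeded above), so per
    // coefficient the remaining work is just reading the result back; the
    // contraction evaluator uses the same convention.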
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 1fb27a65b..893542d29 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -88,6 +88,10 @@ struct TensorEvaluator, Device> typedef TensorEvalToOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = true, @@ -104,10 +108,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } - typedef typename XprType::Index Index; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { @@ -138,6 +138,13 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 947a8ed88..f1f9a90df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -101,6 +101,11 @@ struct TensorEvaluator } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: @@ -219,6 +224,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -237,6 +243,12 @@ struct TensorEvaluator, Device> return m_functor.template packetOp(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -270,6 +282,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const 
Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -293,6 +306,12 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -330,6 +349,7 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -358,6 +378,14 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_leftImpl.costPerCoeff(vectorized) + + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -398,6 +426,7 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -435,6 +464,13 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized) + .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6db45ade..ece2ed91b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -129,6 +129,7 @@ struct TensorEvaluator, D typedef typename internal::conditional::type OutputScalar; typedef OutputScalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -176,7 +177,6 @@ struct TensorEvaluator, D } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); @@ -189,11 +189,17 @@ struct TensorEvaluator, D return m_data[index]; } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType + packet(Index index) const { return internal::ploadt(m_data + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 14f480901..d2b0b30a1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -83,6 +83,10 @@ struct TensorEvaluator, Device> typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = true, @@ -95,10 +99,6 @@ struct TensorEvaluator, Device> : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { @@ -132,6 +132,10 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index b7c13f67f..ccaa757d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -594,6 +594,8 @@ template <> class UniformRandomGenerator > { template struct functor_traits > { enum { + // Rough estimate. + Cost = 100 * NumTraits::MulCost, PacketAccess = UniformRandomGenerator::PacketAccess }; }; @@ -774,6 +776,8 @@ template class NormalRandomGenerator { template struct functor_traits > { enum { + // Rough estimate. + Cost = 100 * NumTraits::MulCost, PacketAccess = NormalRandomGenerator::PacketAccess }; }; @@ -807,6 +811,15 @@ class GaussianGenerator { array m_two_sigmas; }; +template +struct functor_traits > { + enum { + Cost = NumDims * (2 * NumTraits::AddCost + NumTraits::MulCost + + functor_traits >::Cost) + + functor_traits >::Cost, + PacketAccess = GaussianGenerator::PacketAccess + }; +}; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index e4154bd0b..9e5791bd3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -145,6 +145,14 @@ struct TensorEvaluator, Device> return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. 
+ return TensorOpCost(0, 0, TensorOpCost::AddCost() + + TensorOpCost::MulCost()); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 72594a05c..0b769f278 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -159,6 +159,9 @@ struct TensorEvaluator, Device> typedef TensorEvaluator, Device> Self; typedef TensorEvaluator Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -307,9 +310,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -362,15 +362,14 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const Index packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { return packetWithPossibleZero(index); } - const Index indices[2] = {index, index + packetSize - 1}; + const Index indices[2] = {index, index + PacketSize - 1}; const Index patchIndex = indices[0] / m_fastPatchStride; if (patchIndex != indices[1] / m_fastPatchStride) { return packetWithPossibleZero(index); @@ -434,12 +433,24 @@ struct TensorEvaluator, Device> Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator::CoordAccess is false. 
+ const double compute_cost = 3 * TensorOpCost::DivCost() + + 6 * TensorOpCost::MulCost() + + 8 * TensorOpCost::MulCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + const int PacketSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 368e6f685..de2f67d74 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -81,6 +81,10 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, @@ -123,11 +127,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -190,18 +189,30 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * TensorOpCost::DivCost() + + 3 * TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) + return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, + compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 9b85914ff..63a8476ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -155,6 +155,10 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index a9c222ea0..bfa65a607 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -142,6 +142,10 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } const TensorEvaluator& impl() const { return m_impl; } @@ -449,6 +453,11 @@ struct TensorEvaluator, Devi } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); if (result) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a595a0175..88b838b27 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -87,6 +87,10 @@ struct TensorEvaluator, Device typedef typename XprType::Index Index; static const int NumDims = internal::array_size::value; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -129,10 +133,6 @@ struct TensorEvaluator, Device } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -224,21 +224,51 @@ struct TensorEvaluator, Device return m_impl.coeff(inputIndex); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + TensorOpCost cost = m_impl.costPerCoeff(vectorized); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) + updateCostPerDimension(cost, i, i == 0); + } else { + for (int i = NumDims - 1; i >= 0; --i) + updateCostPerDimension(cost, i, i == NumDims - 1); + } + return cost; + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + private: + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { + const double in = static_cast(m_impl.dimensions()[i]); + const double out = in + m_padding[i].first + m_padding[i].second; + if (out == 0) + return; + const double reduction = in / out; + cost *= reduction; + if (first) { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + reduction * (1 * TensorOpCost::AddCost())); + } else { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + reduction * (2 * 
TensorOpCost::MulCost() + + 1 * TensorOpCost::DivCost())); + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index first = index; - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; @@ -263,7 +293,7 @@ struct TensorEvaluator, Device } } - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index first = index; const Index lastPaddedLeft = m_padding[0].first; const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); @@ -288,16 +318,15 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index first = index; - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; @@ -322,7 +351,7 @@ struct TensorEvaluator, Device } } - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index first = index; const Index lastPaddedLeft = m_padding[NumDims-1].first; const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); @@ -347,9 +376,8 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 0bf460f4e..a87e45330 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -85,6 +85,10 @@ struct TensorEvaluator, Device> static const int NumDims = internal::array_size::Dimensions>::value + 1; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = 
internal::unpacket_traits::size; + enum { IsAligned = false, @@ -137,9 +141,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -183,12 +184,11 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; - Index indices[2] = {index, index + packetSize - 1}; + Index indices[2] = {index, index + PacketSize - 1}; Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], indices[1] / m_outputStrides[output_stride_index]}; Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], @@ -229,15 +229,15 @@ struct TensorEvaluator, Device> inputIndices[0] += (patchIndices[0] + patchOffsets[0]); inputIndices[1] += (patchIndices[1] + patchOffsets[1]); - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { PacketReturnType rslt = m_impl.template packet(inputIndices[0]); return rslt; } else { - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); @@ -245,6 +245,14 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost() + + TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 00f870328..1c9e7ab66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -411,6 +411,9 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef TensorEvaluator, Device> Self; static const bool InputPacketAccess = TensorEvaluator::PacketAccess; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -495,9 +498,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_STRONG_INLINE 
EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); @@ -584,16 +584,15 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; const Index firstIndex = firstInput(index); - for (Index i = 0; i < packetSize; ++i) { + for (Index i = 0; i < PacketSize; ++i) { Op reducer(m_reducer); values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, num_values_to_reduce, reducer); @@ -602,18 +601,18 @@ struct TensorEvaluator, Device> const Index firstIndex = firstInput(index); const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; // TBD: extend this the the n innermost dimensions that we preserve. - if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { Op reducer(m_reducer); typename Self::PacketReturnType accum = reducer.template initializePacket(); internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); return reducer.finalizePacket(accum); } else { - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); } } } else { - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); } } @@ -621,6 +620,18 @@ struct TensorEvaluator, Device> return rslt; } + // Must be called after evalSubExprsIfNeeded(). 
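// Illustration (not part of the patch): for a partial reduction such as summing
// a 1000 x 128 float tensor over its second dimension, num_values_to_reduce in
// the method below is 128, so each output coefficient is charged roughly
//
//   m_impl.costPerCoeff(vectorized) * 128                          // input reads
//   + TensorOpCost(0, 0, 128 * functor_traits<SumReducer<float> >::Cost)
//
// (both names live in Eigen::internal), while a finished full reduction, whose
// result is already cached in m_result, only reports the load of that single
// result value.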
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 96d92038c..1a59cc8f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -104,6 +104,10 @@ struct TensorEvaluator, Device typedef typename XprType::Index Index; static const int NumDims = internal::array_size::value; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -135,10 +139,6 @@ struct TensorEvaluator, Device } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -195,21 +195,33 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // TODO(ndjaitly): write a better packing routine that uses // local structure. 
EIGEN_ALIGN_MAX typename internal::remove_const::type - values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + for (int i = 0; i < NumDims; ++i) { + if (m_reverse[i]) { + compute_cost += 2 * TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -246,6 +258,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } @@ -256,14 +269,13 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // This code is pilfered from TensorMorphing.h - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; internal::pstore(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index c19833ea5..e76533710 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -104,6 +104,9 @@ struct TensorEvaluator, Device> static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -145,9 +148,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -166,18 +166,25 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < 
packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -219,6 +226,9 @@ struct TensorEvaluator, Device> static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -230,9 +240,6 @@ struct TensorEvaluator, Device> : Base(op, device) { } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(this->srcCoeff(index)); @@ -241,12 +248,11 @@ struct TensorEvaluator, Device> template EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 085f8fd3d..23248c626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -103,6 +103,10 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = /*TensorEvaluator::IsAligned*/false, @@ -142,10 +146,6 @@ struct TensorEvaluator, Device> } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -164,12 +164,11 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) 
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; + Index indices[] = {index, index + PacketSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_outputStrides[i]; @@ -193,15 +192,15 @@ struct TensorEvaluator, Device> inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { PacketReturnType rslt = m_impl.template packet(inputIndices[0]); return rslt; } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); @@ -209,6 +208,14 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -266,6 +273,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { @@ -275,12 +283,11 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; + Index indices[] = {index, index + PacketSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; @@ -304,15 +311,15 @@ struct TensorEvaluator, Device> inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { this->m_impl.template writePacket(inputIndices[0], x); } else { - EIGEN_ALIGN_MAX Scalar values[packetSize]; + EIGEN_ALIGN_MAX Scalar values[PacketSize]; internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; - this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; - for (int i = 1; i < packetSize-1; ++i) { + this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1]; + for (int i = 1; i < PacketSize-1; ++i) { 
this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 5bdfbad46..f8db43dbe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -171,6 +171,9 @@ struct TensorEvaluator, D static const int NumDims = NumInputDims + 1; typedef DSizes Dimensions; typedef typename internal::remove_const::type Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, @@ -336,9 +339,6 @@ struct TensorEvaluator, D } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -408,16 +408,15 @@ struct TensorEvaluator, D template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const Index packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { return packetWithPossibleZero(index); } - const Index indices[2] = {index, index + packetSize - 1}; + const Index indices[2] = {index, index + PacketSize - 1}; const Index patchIndex = indices[0] / m_fastPatchStride; if (patchIndex != indices[1] / m_fastPatchStride) { return packetWithPossibleZero(index); @@ -495,6 +494,14 @@ struct TensorEvaluator, D return packetWithPossibleZero(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = + 10 * TensorOpCost::DivCost() + 21 * TensorOpCost::MulCost() + + 8 * TensorOpCost::AddCost(); + return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } @@ -518,9 +525,9 @@ struct TensorEvaluator, D protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + const int PacketSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload(values); -- cgit v1.2.3 From 78a51abc123dff49d6e1b1a6dd5b193e92ae0817 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 15:23:10 -0700 Subject: Added a more scalable non blocking thread pool --- unsupported/Eigen/CXX11/ThreadPool | 69 ++++++ .../Eigen/CXX11/src/ThreadPool/EventCount.h | 234 +++++++++++++++++++++ .../CXX11/src/ThreadPool/NonBlockingThreadPool.h | 232 ++++++++++++++++++++ unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 210 ++++++++++++++++++ 
.../Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 127 +++++++++++ .../Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 38 ++++ .../CXX11/src/ThreadPool/ThreadPoolInterface.h | 26 +++ 7 files changed, 936 insertions(+) create mode 100644 unsupported/Eigen/CXX11/ThreadPool create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool new file mode 100644 index 000000000..85028603c --- /dev/null +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -0,0 +1,69 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_MODULE +#define EIGEN_CXX11_THREADPOOL_MODULE + +//#include +#include "Core" + +#include + +/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module + * + * This module provides 2 threadpool implementations + * - a simple reference implementation + * - a faster non blocking implementation + * + * This module requires C++11. + * + * \code + * #include + * \endcode + */ + +//#include + +//#include "src/Core/util/EmulateArray.h" +//#include "src/Core/util/MaxSizeVector.h" +//#include "third_party/eigen3/Eigen/src/Core/util/Macros.h" + + +// Emulate the cxx11 functionality that we need if the compiler doesn't support it. +// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it +// supports enough of the standard for our needs + +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/ThreadPool/EventCount.h" +#include "src/ThreadPool/RunQueue.h" +#include "src/ThreadPool/ThreadPoolInterface.h" +#include "src/ThreadPool/ThreadEnvironment.h" +#include "src/ThreadPool/SimpleThreadPool.h" +#include "src/ThreadPool/NonBlockingThreadPool.h" + +#endif + +#include + +#endif // EIGEN_CXX11_THREADPOOL_MODULE + diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h new file mode 100644 index 000000000..16eee1a41 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -0,0 +1,234 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ +#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ + +namespace Eigen { + +// EventCount allows to wait for arbitrary predicates in non-blocking +// algorithms. Think of condition variable, but wait predicate does not need to +// be protected by a mutex. 
Usage: +// Waiting thread does: +// +// if (predicate) +// return act(); +// EventCount::Waiter& w = waiters[my_index]; +// ec.Prewait(&w); +// if (predicate) { +// ec.CancelWait(&w); +// return act(); +// } +// ec.CommitWait(&w); +// +// Notifying thread does: +// +// predicate = true; +// ec.Notify(true); +// +// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not +// cheap, but they are executed only if the preceeding predicate check has +// failed. +// +// Algorihtm outline: +// There are two main variables: predicate (managed by user) and state_. +// Operation closely resembles Dekker mutual algorithm: +// https://en.wikipedia.org/wiki/Dekker%27s_algorithm +// Waiting thread sets state_ then checks predicate, Notifying thread sets +// predicate then checks state_. Due to seq_cst fences in between these +// operations it is guaranteed than either waiter will see predicate change +// and won't block, or notifying thread will see state_ change and will unblock +// the waiter, or both. But it can't happen that both threads don't see each +// other changes, which would lead to deadlock. +class EventCount { + public: + class Waiter; + + EventCount(std::vector& waiters) : waiters_(waiters) { + eigen_assert(waiters.size() < (1 << kWaiterBits) - 1); + // Initialize epoch to something close to overflow to test overflow. + state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2); + } + + ~EventCount() { + // Ensure there are no waiters. + eigen_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask); + } + + // Prewait prepares for waiting. + // After calling this function the thread must re-check the wait predicate + // and call either CancelWait or CommitWait passing the same Waiter object. + void Prewait(Waiter* w) { + w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + } + + // CommitWait commits waiting. + void CommitWait(Waiter* w) { + w->state = Waiter::kNotSignaled; + // Modification epoch of this waiter. + uint64_t epoch = + (w->epoch & kEpochMask) + + (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + uint64_t state = state_.load(std::memory_order_seq_cst); + for (;;) { + if (int64_t((state & kEpochMask) - epoch) < 0) { + // The preceeding waiter has not decided on its fate. Wait until it + // calls either CancelWait or CommitWait, or is notified. + std::this_thread::yield(); + state = state_.load(std::memory_order_seq_cst); + continue; + } + // We've already been notified. + if (int64_t((state & kEpochMask) - epoch) > 0) return; + // Remove this thread from prewait counter and add it to the waiter list. + eigen_assert((state & kWaiterMask) != 0); + uint64_t newstate = state - kWaiterInc + kEpochInc; + newstate = (newstate & ~kStackMask) | (w - &waiters_[0]); + if ((state & kStackMask) == kStackMask) + w->next.store(nullptr, std::memory_order_relaxed); + else + w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed); + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_release)) + break; + } + Park(w); + } + + // CancelWait cancels effects of the previous Prewait call. + void CancelWait(Waiter* w) { + uint64_t epoch = + (w->epoch & kEpochMask) + + (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + uint64_t state = state_.load(std::memory_order_relaxed); + for (;;) { + if (int64_t((state & kEpochMask) - epoch) < 0) { + // The preceeding waiter has not decided on its fate. 
Wait until it + // calls either CancelWait or CommitWait, or is notified. + std::this_thread::yield(); + state = state_.load(std::memory_order_relaxed); + continue; + } + // We've already been notified. + if (int64_t((state & kEpochMask) - epoch) > 0) return; + // Remove this thread from prewait counter. + eigen_assert((state & kWaiterMask) != 0); + if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc, + std::memory_order_relaxed)) + return; + } + } + + // Notify wakes one or all waiting threads. + // Must be called after changing the associated wait predicate. + void Notify(bool all) { + std::atomic_thread_fence(std::memory_order_seq_cst); + uint64_t state = state_.load(std::memory_order_acquire); + for (;;) { + // Easy case: no waiters. + if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0) + return; + uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + uint64_t newstate; + if (all) { + // Reset prewait counter and empty wait list. + newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask; + } else if (waiters) { + // There is a thread in pre-wait state, unblock it. + newstate = state + kEpochInc - kWaiterInc; + } else { + // Pop a waiter from list and unpark it. + Waiter* w = &waiters_[state & kStackMask]; + Waiter* wnext = w->next.load(std::memory_order_relaxed); + uint64_t next = kStackMask; + if (wnext != nullptr) next = wnext - &waiters_[0]; + // Note: we don't add kEpochInc here. ABA problem on the lock-free stack + // can't happen because a waiter is re-pushed onto the stack only after + // it was in the pre-wait state which inevitably leads to epoch + // increment. + newstate = (state & kEpochMask) + next; + } + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_acquire)) { + if (!all && waiters) return; // unblocked pre-wait thread + if ((state & kStackMask) == kStackMask) return; + Waiter* w = &waiters_[state & kStackMask]; + if (!all) w->next.store(nullptr, std::memory_order_relaxed); + Unpark(w); + return; + } + } + } + + class Waiter { + friend class EventCount; + std::atomic next; + std::mutex mu; + std::condition_variable cv; + uint64_t epoch; + unsigned state; + enum { + kNotSignaled, + kWaiting, + kSignaled, + }; + // Prevent false sharing with other Waiter objects in the same vector. + char pad_[128]; + }; + + private: + // State_ layout: + // - low kStackBits is a stack of waiters committed wait. + // - next kWaiterBits is count of waiters in prewait state. + // - next kEpochBits is modification counter. 
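// For example (illustration only), with the constants chosen below a state_
// word decomposes as
//   bits  0..15  index of the waiter on top of the committed-wait stack
//                (the all-ones value kStackMask marks an empty stack)
//   bits 16..31  count of threads currently in the prewait state
//   bits 32..63  modification epoch
// so state_ == (3ull << kEpochShift) | (2ull << kWaiterShift) | 5 would mean:
// epoch 3, two prewaiting threads, and waiters_[5] on top of the wait stack.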
+ static const uint64_t kStackBits = 16; + static const uint64_t kStackMask = (1ull << kStackBits) - 1; + static const uint64_t kWaiterBits = 16; + static const uint64_t kWaiterShift = 16; + static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) + << kWaiterShift; + static const uint64_t kWaiterInc = 1ull << kWaiterBits; + static const uint64_t kEpochBits = 32; + static const uint64_t kEpochShift = 32; + static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; + static const uint64_t kEpochInc = 1ull << kEpochShift; + std::atomic state_; + std::vector& waiters_; + + void Park(Waiter* w) { + std::unique_lock lock(w->mu); + while (w->state != Waiter::kSignaled) { + w->state = Waiter::kWaiting; + w->cv.wait(lock); + } + } + + void Unpark(Waiter* waiters) { + Waiter* next = nullptr; + for (Waiter* w = waiters; w; w = next) { + next = w->next.load(std::memory_order_relaxed); + unsigned state; + { + std::unique_lock lock(w->mu); + state = w->state; + w->state = Waiter::kSignaled; + } + // Avoid notifying if it wasn't waiting. + if (state == Waiter::kWaiting) w->cv.notify_one(); + } + } + + EventCount(const EventCount&) = delete; + void operator=(const EventCount&) = delete; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h new file mode 100644 index 000000000..18dec5393 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H +#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H + + +namespace Eigen { + +template +class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { + public: + typedef typename Environment::Task Task; + typedef RunQueue Queue; + + NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) + : env_(env), + threads_(num_threads), + queues_(num_threads), + waiters_(num_threads), + blocked_(), + spinning_(), + done_(), + ec_(waiters_) { + for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue()); + for (int i = 0; i < num_threads; i++) + threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); + } + + ~NonBlockingThreadPoolTempl() { + done_.store(true, std::memory_order_relaxed); + // Now if all threads block without work, they will start exiting. + // But note that threads can continue to work arbitrary long, + // block, submit new work, unblock and otherwise live full life. + ec_.Notify(true); + + // Join threads explicitly to avoid destruction order issues. + for (size_t i = 0; i < threads_.size(); i++) delete threads_[i]; + for (size_t i = 0; i < threads_.size(); i++) delete queues_[i]; + } + + void Schedule(std::function fn) { + Task t = env_.CreateTask(std::move(fn)); + PerThread* pt = GetPerThread(); + if (pt->pool == this) { + // Worker thread of this pool, push onto the thread's queue. + Queue* q = queues_[pt->index]; + t = q->PushFront(std::move(t)); + } else { + // A free-standing thread (or worker of another pool), push onto a random + // queue. 
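// A concrete use of the two paths above (an illustrative sketch, not part of
// this patch):
//
//   Eigen::NonBlockingThreadPool pool(4 /* worker threads */);
//   pool.Schedule([] { /* some closure */ });   // called from a non-worker thread
//
// The task lands at the back of a randomly chosen worker queue, from which that
// worker (or a stealing worker) eventually runs it. A Schedule call made from
// inside a task, i.e. on a worker of this pool, instead pushes to the front of
// that worker's own queue; and whenever a push fails because the queue is full,
// the task is simply executed inline, as the code just below shows.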
+ Queue* q = queues_[Rand(&pt->rand) % queues_.size()]; + t = q->PushBack(std::move(t)); + } + // Note: below we touch this after making w available to worker threads. + // Strictly speaking, this can lead to a racy-use-after-free. Consider that + // Schedule is called from a thread that is neither main thread nor a worker + // thread of this pool. Then, execution of w directly or indirectly + // completes overall computations, which in turn leads to destruction of + // this. We expect that such scenario is prevented by program, that is, + // this is kept alive while any threads can potentially be in Schedule. + if (!t.f) + ec_.Notify(false); + else + env_.ExecuteTask(t); // Push failed, execute directly. + } + + private: + typedef typename Environment::EnvThread Thread; + + struct PerThread { + bool inited; + NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads. + unsigned index; // Worker thread index in pool. + unsigned rand; // Random generator state. + }; + + Environment env_; + MaxSizeVector threads_; + MaxSizeVector queues_; + std::vector waiters_; + std::atomic blocked_; + std::atomic spinning_; + std::atomic done_; + EventCount ec_; + + // Main worker thread loop. + void WorkerLoop(unsigned index) { + PerThread* pt = GetPerThread(); + pt->pool = this; + pt->index = index; + Queue* q = queues_[index]; + EventCount::Waiter* waiter = &waiters_[index]; + std::vector stolen; + for (;;) { + Task t; + if (!stolen.empty()) { + t = std::move(stolen.back()); + stolen.pop_back(); + } + if (!t.f) t = q->PopFront(); + if (!t.f) { + if (Steal(&stolen)) { + t = std::move(stolen.back()); + stolen.pop_back(); + while (stolen.size()) { + Task t1 = q->PushFront(std::move(stolen.back())); + stolen.pop_back(); + if (t1.f) { + // There is not much we can do in this case. Just execute the + // remaining directly. + stolen.push_back(std::move(t1)); + break; + } + } + } + } + if (t.f) { + env_.ExecuteTask(t); + continue; + } + // Leave one thread spinning. This reduces latency. + if (!spinning_ && !spinning_.exchange(true)) { + bool nowork = true; + for (int i = 0; i < 1000; i++) { + if (!OutOfWork()) { + nowork = false; + break; + } + } + spinning_ = false; + if (!nowork) continue; + } + if (!WaitForWork(waiter)) return; + } + } + + // Steal tries to steal work from other worker threads in best-effort manner. + bool Steal(std::vector* stolen) { + if (queues_.size() == 1) return false; + PerThread* pt = GetPerThread(); + unsigned lastq = pt->index; + for (unsigned i = queues_.size(); i > 0; i--) { + unsigned victim = Rand(&pt->rand) % queues_.size(); + if (victim == lastq && queues_.size() > 2) { + i++; + continue; + } + // Steal half of elements from a victim queue. + // It is typical to steal just one element, but that assumes that work is + // recursively subdivided in halves so that the stolen element is exactly + // half of work. If work elements are equally-sized, then is makes sense + // to steal half of elements at once and then work locally for a while. + if (queues_[victim]->PopBackHalf(stolen)) return true; + lastq = victim; + } + // Just to make sure that we did not miss anything. + for (unsigned i = queues_.size(); i > 0; i--) + if (queues_[i - 1]->PopBackHalf(stolen)) return true; + return false; + } + + // WaitForWork blocks until new work is available, or if it is time to exit. + bool WaitForWork(EventCount::Waiter* waiter) { + // We already did best-effort emptiness check in Steal, so prepare blocking. + ec_.Prewait(waiter); + // Now do reliable emptiness check. 
+ if (!OutOfWork()) { + ec_.CancelWait(waiter); + return true; + } + // Number of blocked threads is used as termination condition. + // If we are shutting down and all worker threads blocked without work, + // that's we are done. + blocked_++; + if (done_ && blocked_ == threads_.size()) { + ec_.CancelWait(waiter); + // Almost done, but need to re-check queues. + // Consider that all queues are empty and all worker threads are preempted + // right after incrementing blocked_ above. Now a free-standing thread + // submits work and calls destructor (which sets done_). If we don't + // re-check queues, we will exit leaving the work unexecuted. + if (!OutOfWork()) { + // Note: we must not pop from queues before we decrement blocked_, + // otherwise the following scenario is possible. Consider that instead + // of checking for emptiness we popped the only element from queues. + // Now other worker threads can start exiting, which is bad if the + // work item submits other work. So we just check emptiness here, + // which ensures that all worker threads exit at the same time. + blocked_--; + return true; + } + // Reached stable termination state. + ec_.Notify(true); + return false; + } + ec_.CommitWait(waiter); + blocked_--; + return true; + } + + bool OutOfWork() { + for (unsigned i = 0; i < queues_.size(); i++) + if (!queues_[i]->Empty()) return false; + return true; + } + + PerThread* GetPerThread() { + static thread_local PerThread per_thread_; + PerThread* pt = &per_thread_; + if (pt->inited) return pt; + pt->inited = true; + pt->rand = std::hash()(std::this_thread::get_id()); + return pt; + } + + static unsigned Rand(unsigned* state) { + return *state = *state * 1103515245 + 12345; + } +}; + +typedef NonBlockingThreadPoolTempl NonBlockingThreadPool; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h new file mode 100644 index 000000000..aaa1d92c7 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -0,0 +1,210 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ +#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ + + +namespace Eigen { + +// RunQueue is a fixed-size, partially non-blocking deque or Work items. +// Operations on front of the queue must be done by a single thread (owner), +// operations on back of the queue can be done by multiple threads concurrently. +// +// Algorithm outline: +// All remote threads operating on the queue back are serialized by a mutex. +// This ensures that at most two threads access state: owner and one remote +// thread (Size aside). The algorithm ensures that the occupied region of the +// underlying array is logically continuous (can wraparound, but no stray +// occupied elements). Owner operates on one end of this region, remote thread +// operates on the other end. Synchronization between these threads +// (potential consumption of the last element and take up of the last empty +// element) happens by means of state variable in each element. States are: +// empty, busy (in process of insertion of removal) and ready. 
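// (Illustration of the per-element state machine just described: during one
// insertion a slot goes kEmpty --CAS--> kBusy --store w, then--> kReady, and
// during one removal kReady --CAS--> kBusy --take w, then--> kEmpty; the CAS
// steps are the claim points described next.)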
Threads claim +// elements (empty->busy and ready->busy transitions) by means of a CAS +// operation. The finishing transition (busy->empty and busy->ready) are done +// with plain store as the element is exclusively owned by the current thread. +// +// Note: we could permit only pointers as elements, then we would not need +// separate state variable as null/non-null pointer value would serve as state, +// but that would require malloc/free per operation for large, complex values +// (and this is designed to store std::function<()>). +template +class RunQueue { + public: + RunQueue() : front_(), back_() { + // require power-of-two for fast masking + eigen_assert((kSize & (kSize - 1)) == 0); + eigen_assert(kSize > 2); // why would you do this? + eigen_assert(kSize <= (64 << 10)); // leave enough space for counter + for (unsigned i = 0; i < kSize; i++) + array_[i].state.store(kEmpty, std::memory_order_relaxed); + } + + ~RunQueue() { eigen_assert(Size() == 0); } + + // PushFront inserts w at the beginning of the queue. + // If queue is full returns w, otherwise returns default-constructed Work. + Work PushFront(Work w) { + unsigned front = front_.load(std::memory_order_relaxed); + Elem* e = &array_[front & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kEmpty || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return w; + front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed); + e->w = std::move(w); + e->state.store(kReady, std::memory_order_release); + return Work(); + } + + // PopFront removes and returns the first element in the queue. + // If the queue was empty returns default-constructed Work. + Work PopFront() { + unsigned front = front_.load(std::memory_order_relaxed); + Elem* e = &array_[(front - 1) & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kReady || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return Work(); + Work w = std::move(e->w); + e->state.store(kEmpty, std::memory_order_release); + front = ((front - 1) & kMask2) | (front & ~kMask2); + front_.store(front, std::memory_order_relaxed); + return w; + } + + // PushBack adds w at the end of the queue. + // If queue is full returns w, otherwise returns default-constructed Work. + Work PushBack(Work w) { + std::unique_lock lock(mutex_); + unsigned back = back_.load(std::memory_order_relaxed); + Elem* e = &array_[(back - 1) & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kEmpty || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return w; + back = ((back - 1) & kMask2) | (back & ~kMask2); + back_.store(back, std::memory_order_relaxed); + e->w = std::move(w); + e->state.store(kReady, std::memory_order_release); + return Work(); + } + + // PopBack removes and returns the last elements in the queue. + // Can fail spuriously. + Work PopBack() { + if (Empty()) return 0; + std::unique_lock lock(mutex_, std::try_to_lock); + if (!lock) return Work(); + unsigned back = back_.load(std::memory_order_relaxed); + Elem* e = &array_[back & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kReady || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return Work(); + Work w = std::move(e->w); + e->state.store(kEmpty, std::memory_order_release); + back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed); + return w; + } + + // PopBackHalf removes and returns half last elements in the queue. 
+ // Returns number of elements removed. But can also fail spuriously. + unsigned PopBackHalf(std::vector* result) { + if (Empty()) return 0; + std::unique_lock lock(mutex_, std::try_to_lock); + if (!lock) return 0; + unsigned back = back_.load(std::memory_order_relaxed); + unsigned size = Size(); + unsigned mid = back; + if (size > 1) mid = back + (size - 1) / 2; + unsigned n = 0; + unsigned start = 0; + for (; static_cast(mid - back) >= 0; mid--) { + Elem* e = &array_[mid & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (n == 0) { + if (s != kReady || + !e->state.compare_exchange_strong(s, kBusy, + std::memory_order_acquire)) + continue; + start = mid; + } else { + // Note: no need to store temporal kBusy, we exclusively own these + // elements. + eigen_assert(s == kReady); + } + result->push_back(std::move(e->w)); + e->state.store(kEmpty, std::memory_order_release); + n++; + } + if (n != 0) + back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed); + return n; + } + + // Size returns current queue size. + // Can be called by any thread at any time. + unsigned Size() const { + // Emptiness plays critical role in thread pool blocking. So we go to great + // effort to not produce false positives (claim non-empty queue as empty). + for (;;) { + // Capture a consistent snapshot of front/tail. + unsigned front = front_.load(std::memory_order_acquire); + unsigned back = back_.load(std::memory_order_acquire); + unsigned front1 = front_.load(std::memory_order_relaxed); + if (front != front1) continue; + int size = (front & kMask2) - (back & kMask2); + // Fix overflow. + if (size < 0) size += 2 * kSize; + // Order of modification in push/pop is crafted to make the queue look + // larger than it is during concurrent modifications. E.g. pop can + // decrement size before the corresponding push has incremented it. + // So the computed size can be up to kSize + 1, fix it. + if (size > kSize) size = kSize; + return size; + } + } + + // Empty tests whether container is empty. + // Can be called by any thread at any time. + bool Empty() const { return Size() == 0; } + + private: + static const unsigned kMask = kSize - 1; + static const unsigned kMask2 = (kSize << 1) - 1; + struct Elem { + std::atomic state; + Work w; + }; + enum { + kEmpty, + kBusy, + kReady, + }; + std::mutex mutex_; + // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of + // front/back, repsectively. The remaining bits contain modification counters + // that are incremented on Push operations. This allows us to (1) distinguish + // between empty and full conditions (if we would use log(kSize) bits for + // position, these conditions would be indistinguishable); (2) obtain + // consistent snapshot of front_/back_ for Size operation using the + // modification counters. + std::atomic front_; + std::atomic back_; + Elem array_[kSize]; + + RunQueue(const RunQueue&) = delete; + void operator=(const RunQueue&) = delete; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h new file mode 100644 index 000000000..17fd1658b --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h @@ -0,0 +1,127 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H +#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H + +namespace Eigen { + +// The implementation of the ThreadPool type ensures that the Schedule method +// runs the functions it is provided in FIFO order when the scheduling is done +// by a single thread. +// Environment provides a way to create threads and also allows to intercept +// task submission and execution. +template +class SimpleThreadPoolTempl : public ThreadPoolInterface { + public: + // Construct a pool that contains "num_threads" threads. + explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment()) + : env_(env), threads_(num_threads), waiters_(num_threads) { + for (int i = 0; i < num_threads; i++) { + threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); + } + } + + // Wait until all scheduled work has finished and then destroy the + // set of threads. + ~SimpleThreadPoolTempl() { + { + // Wait for all work to get done. + std::unique_lock l(mu_); + while (!pending_.empty()) { + empty_.wait(l); + } + exiting_ = true; + + // Wakeup all waiters. + for (auto w : waiters_) { + w->ready = true; + w->task.f = nullptr; + w->cv.notify_one(); + } + } + + // Wait for threads to finish. + for (auto t : threads_) { + delete t; + } + } + + // Schedule fn() for execution in the pool of threads. The functions are + // executed in the order in which they are scheduled. + void Schedule(std::function fn) { + Task t = env_.CreateTask(std::move(fn)); + std::unique_lock l(mu_); + if (waiters_.empty()) { + pending_.push_back(std::move(t)); + } else { + Waiter* w = waiters_.back(); + waiters_.pop_back(); + w->ready = true; + w->task = std::move(t); + w->cv.notify_one(); + } + } + + protected: + void WorkerLoop() { + std::unique_lock l(mu_); + Waiter w; + Task t; + while (!exiting_) { + if (pending_.empty()) { + // Wait for work to be assigned to me + w.ready = false; + waiters_.push_back(&w); + while (!w.ready) { + w.cv.wait(l); + } + t = w.task; + w.task.f = nullptr; + } else { + // Pick up pending work + t = std::move(pending_.front()); + pending_.pop_front(); + if (pending_.empty()) { + empty_.notify_all(); + } + } + if (t.f) { + mu_.unlock(); + env_.ExecuteTask(t); + t.f = nullptr; + mu_.lock(); + } + } + } + + private: + typedef typename Environment::Task Task; + typedef typename Environment::EnvThread Thread; + + struct Waiter { + std::condition_variable cv; + Task task; + bool ready; + }; + + Environment env_; + std::mutex mu_; + MaxSizeVector threads_; // All threads + MaxSizeVector waiters_; // Stack of waiting threads. + std::deque pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() + bool exiting_ = false; +}; + +typedef SimpleThreadPoolTempl SimpleThreadPool; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h new file mode 100644 index 000000000..d2204ad5b --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h @@ -0,0 +1,38 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H +#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H + +namespace Eigen { + +struct StlThreadEnvironment { + struct Task { + std::function f; + }; + + // EnvThread constructor must start the thread, + // destructor must join the thread. + class EnvThread { + public: + EnvThread(std::function f) : thr_(f) {} + ~EnvThread() { thr_.join(); } + + private: + std::thread thr_; + }; + + EnvThread* CreateThread(std::function f) { return new EnvThread(f); } + Task CreateTask(std::function f) { return Task{std::move(f)}; } + void ExecuteTask(const Task& t) { t.f(); } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h new file mode 100644 index 000000000..38b40aceb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h @@ -0,0 +1,26 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H +#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H + +namespace Eigen { + +// This defines an interface that ThreadPoolDevice can take to use +// custom thread pools underneath. +class ThreadPoolInterface { + public: + virtual void Schedule(std::function fn) = 0; + + virtual ~ThreadPoolInterface() {} +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H -- cgit v1.2.3 From a8e8837ba7f76c5cad3f2206cd32f90ee48eea96 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 15:23:49 -0700 Subject: Added tests for the non blocking thread pool --- unsupported/test/cxx11_eventcount.cpp | 128 ++++++++++++++++++++ unsupported/test/cxx11_runqueue.cpp | 214 ++++++++++++++++++++++++++++++++++ 2 files changed, 342 insertions(+) create mode 100644 unsupported/test/cxx11_eventcount.cpp create mode 100644 unsupported/test/cxx11_runqueue.cpp diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp new file mode 100644 index 000000000..a58c21a30 --- /dev/null +++ b/unsupported/test/cxx11_eventcount.cpp @@ -0,0 +1,128 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" +#include + +static void test_basic_eventcount() +{ + std::vector waiters(1); + EventCount ec(waiters); + EventCount::Waiter& w = waiters[0]; + ec.Notify(false); + ec.Prewait(&w); + ec.Notify(true); + ec.CommitWait(&w); + ec.Prewait(&w); + ec.CancelWait(&w); +} + +// Fake bounded counter-based queue. 
+struct TestQueue { + std::atomic val_; + static const int kQueueSize = 10; + + TestQueue() : val_() {} + + ~TestQueue() { VERIFY_IS_EQUAL(val_.load(), 0); } + + bool Push() { + int val = val_.load(std::memory_order_relaxed); + for (;;) { + VERIFY_GE(val, 0); + VERIFY_LE(val, kQueueSize); + if (val == kQueueSize) return false; + if (val_.compare_exchange_weak(val, val + 1, std::memory_order_relaxed)) + return true; + } + } + + bool Pop() { + int val = val_.load(std::memory_order_relaxed); + for (;;) { + VERIFY_GE(val, 0); + VERIFY_LE(val, kQueueSize); + if (val == 0) return false; + if (val_.compare_exchange_weak(val, val - 1, std::memory_order_relaxed)) + return true; + } + } + + bool Empty() { return val_.load(std::memory_order_relaxed) == 0; } +}; + +const int TestQueue::kQueueSize; + +// A number of producers send messages to a set of consumers using a set of +// fake queues. Ensure that it does not crash, consumers don't deadlock and +// number of blocked and unblocked threads match. +static void test_stress_eventcount() +{ + const int kThreads = std::thread::hardware_concurrency(); + const int kEvents = 1 << 16; + const int kQueues = 10; + + std::vector waiters(kThreads); + EventCount ec(waiters); + TestQueue queues[kQueues]; + + std::vector> producers; + for (int i = 0; i < kThreads; i++) { + producers.emplace_back(new std::thread([&ec, &queues]() { + unsigned rnd = std::hash()(std::this_thread::get_id()); + for (int i = 0; i < kEvents; i++) { + unsigned idx = rand_r(&rnd) % kQueues; + if (queues[idx].Push()) { + ec.Notify(false); + continue; + } + std::this_thread::yield(); + i--; + } + })); + } + + std::vector> consumers; + for (int i = 0; i < kThreads; i++) { + consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() { + EventCount::Waiter& w = waiters[i]; + unsigned rnd = std::hash()(std::this_thread::get_id()); + for (int i = 0; i < kEvents; i++) { + unsigned idx = rand_r(&rnd) % kQueues; + if (queues[idx].Pop()) continue; + i--; + ec.Prewait(&w); + bool empty = true; + for (int q = 0; q < kQueues; q++) { + if (!queues[q].Empty()) { + empty = false; + break; + } + } + if (!empty) { + ec.CancelWait(&w); + continue; + } + ec.CommitWait(&w); + } + })); + } + + for (int i = 0; i < kThreads; i++) { + producers[i]->join(); + consumers[i]->join(); + } +} + +void test_cxx11_eventcount() +{ + CALL_SUBTEST(test_basic_eventcount()); + CALL_SUBTEST(test_stress_eventcount()); +} diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp new file mode 100644 index 000000000..5a0a13103 --- /dev/null +++ b/unsupported/test/cxx11_runqueue.cpp @@ -0,0 +1,214 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +static void test_basic_runqueue() +{ + RunQueue q; + // Check empty state. + VERIFY(q.Empty()); + VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0, q.PopFront()); + std::vector stolen; + VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(0, stolen.size()); + // Push one front, pop one front. + VERIFY_IS_EQUAL(0, q.PushFront(1)); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1, q.PopFront()); + VERIFY_IS_EQUAL(0, q.Size()); + // Push front to overflow. 
+ VERIFY_IS_EQUAL(0, q.PushFront(2)); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(0, q.PushFront(3)); + VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(0, q.PushFront(4)); + VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(0, q.PushFront(5)); + VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(6, q.PushFront(6)); + VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(5, q.PopFront()); + VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(4, q.PopFront()); + VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(3, q.PopFront()); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(2, q.PopFront()); + VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0, q.PopFront()); + // Push one back, pop one back. + VERIFY_IS_EQUAL(0, q.PushBack(7)); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(7, stolen[0]); + VERIFY_IS_EQUAL(0, q.Size()); + stolen.clear(); + // Push back to overflow. + VERIFY_IS_EQUAL(0, q.PushBack(8)); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(0, q.PushBack(9)); + VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(0, q.PushBack(10)); + VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(0, q.PushBack(11)); + VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(12, q.PushBack(12)); + VERIFY_IS_EQUAL(4, q.Size()); + // Pop back in halves. + VERIFY_IS_EQUAL(2, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(2, stolen.size()); + VERIFY_IS_EQUAL(10, stolen[0]); + VERIFY_IS_EQUAL(11, stolen[1]); + VERIFY_IS_EQUAL(2, q.Size()); + stolen.clear(); + VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(9, stolen[0]); + VERIFY_IS_EQUAL(1, q.Size()); + stolen.clear(); + VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(8, stolen[0]); + stolen.clear(); + VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(0, stolen.size()); + // Empty again. + VERIFY(q.Empty()); + VERIFY_IS_EQUAL(0, q.Size()); +} + +// Empty tests that the queue is not claimed to be empty when is is in fact not. +// Emptiness property is crucial part of thread pool blocking scheme, +// so we go to great effort to ensure this property. We create a queue with +// 1 element and then push 1 element (either front or back at random) and pop +// 1 element (either front or back at random). So queue always contains at least +// 1 element, but otherwise changes chaotically. Another thread constantly tests +// that the queue is not claimed to be empty. +static void test_empty_runqueue() +{ + RunQueue q; + q.PushFront(1); + std::atomic done(false); + std::thread mutator([&q, &done]() { + unsigned rnd = 0; + std::vector stolen; + for (int i = 0; i < 1 << 18; i++) { + if (rand_r(&rnd) % 2) + VERIFY_IS_EQUAL(0, q.PushFront(1)); + else + VERIFY_IS_EQUAL(0, q.PushBack(1)); + if (rand_r(&rnd) % 2) + VERIFY_IS_EQUAL(1, q.PopFront()); + else { + for (;;) { + if (q.PopBackHalf(&stolen) == 1) { + stolen.clear(); + break; + } + VERIFY_IS_EQUAL(0, stolen.size()); + } + } + } + done = true; + }); + while (!done) { + VERIFY(!q.Empty()); + int size = q.Size(); + VERIFY_GE(size, 1); + VERIFY_LE(size, 2); + } + VERIFY_IS_EQUAL(1, q.PopFront()); + mutator.join(); +} + +// Stress is a chaotic random test. +// One thread (owner) calls PushFront/PopFront, other threads call PushBack/ +// PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass. 
+static void test_stress_runqueue() +{ + const int kEvents = 1 << 18; + RunQueue q; + std::atomic total(0); + std::vector> threads; + threads.emplace_back(new std::thread([&q, &total]() { + int sum = 0; + int pushed = 1; + int popped = 1; + while (pushed < kEvents || popped < kEvents) { + if (pushed < kEvents) { + if (q.PushFront(pushed) == 0) { + sum += pushed; + pushed++; + } + } + if (popped < kEvents) { + int v = q.PopFront(); + if (v != 0) { + sum -= v; + popped++; + } + } + } + total += sum; + })); + for (int i = 0; i < 2; i++) { + threads.emplace_back(new std::thread([&q, &total]() { + int sum = 0; + for (int i = 1; i < kEvents; i++) { + if (q.PushBack(i) == 0) { + sum += i; + continue; + } + std::this_thread::yield(); + i--; + } + total += sum; + })); + threads.emplace_back(new std::thread([&q, &total]() { + int sum = 0; + std::vector stolen; + for (int i = 1; i < kEvents;) { + if (q.PopBackHalf(&stolen) == 0) { + std::this_thread::yield(); + continue; + } + while (stolen.size() && i < kEvents) { + int v = stolen.back(); + stolen.pop_back(); + VERIFY_IS_NOT_EQUAL(v, 0); + sum += v; + i++; + } + } + while (stolen.size()) { + int v = stolen.back(); + stolen.pop_back(); + VERIFY_IS_NOT_EQUAL(v, 0); + while ((v = q.PushBack(v)) != 0) std::this_thread::yield(); + } + total -= sum; + })); + } + for (size_t i = 0; i < threads.size(); i++) threads[i]->join(); + VERIFY(q.Empty()); + VERIFY(total.load() == 0); +} + +void test_cxx11_runqueue() +{ + CALL_SUBTEST_1(test_basic_runqueue()); + CALL_SUBTEST_2(test_empty_runqueue()); + CALL_SUBTEST_3(test_stress_runqueue()); +} -- cgit v1.2.3 From aeb5494a0b2edef3be447cec222e2d178e413389 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Apr 2016 15:52:58 -0700 Subject: Improvements to cost model. --- .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 57 +++++++++++++++++----- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 366352853..32bc5d0b2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -88,6 +88,13 @@ class TensorOpCost { compute_cost * compute_cycles_; } + // Drop memory access component. Intended for cases when memory accesses are + // sequential or are completely masked by computations. + EIGEN_DEVICE_FUNC void dropMemoryCost() { + bytes_loaded_ = 0; + bytes_stored_ = 0; + } + // TODO(rmlarsen): Define min in terms of total cost, not elementwise. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin( const TensorOpCost& rhs) { @@ -155,24 +162,50 @@ class TensorOpCost { template class TensorCostModel { public: - // Costs in device cycles. - static const int kLoadCycles = 3; - static const int kStoreCycles = 3; // Scaling from Eigen compute cost to device cycles. static const int kDeviceCyclesPerComputeCycle = 1; - // Implements a simple "binary" policy: Return 1 if total cost is below - // kMinWorkToParallelize and max_threads otherwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int numThreads( + // Costs in device cycles. + static const int kStartupCycles = 100000; + static const int kPerThreadCycles = 100000; + static const int kTaskSize = 40000; + + // Returns the number of threads in [1:max_threads] to use for + // evaluating an expression with the given output size and cost per + // coefficient. 
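  // Roughly: threads = (totalCost - kStartupCycles) / kPerThreadCycles,
  // rounded up and then clamped to [1:max_threads], so cheap expressions
  // stay single-threaded and each additional thread has to amortize
  // kPerThreadCycles worth of work.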
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { - // Compute total cost C in device cycles. - const double total_cost = - output_size * + double cost = totalCost(output_size, cost_per_coeff); + int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + return numext::mini(max_threads, numext::maxi(1, threads)); + } + + // taskSize assesses parallel task size. + // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task + // granularity needs to be increased to mitigate parallelization overheads. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( + double output_size, const TensorOpCost& cost_per_coeff) { + return totalCost(output_size, cost_per_coeff) / kTaskSize; + } + + private: + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( + double output_size, const TensorOpCost& cost_per_coeff) { + // Cost of memory fetches from L2 cache. 64 is typical cache line size. + // 11 is L2 cache latency on Haswell. + // We don't know whether data is in L1, L2 or L3. But we are most interested + // in single-threaded computational time around 100us-10ms (smaller time + // is too small for parallelization, larger time is not intersting + // either because we are probably using all available threads already). + // And for the target time range, L2 seems to be what matters. Data set + // fitting into L1 is too small to take noticeable time. Data set fitting + // only into L3 presumably will take more than 10ms to load and process. + const double kLoadCycles = 1.0 / 64 * 11; + const double kStoreCycles = 1.0 / 64 * 11; + // Scaling from Eigen compute cost to device cycles. + return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle); - // Smallest work unit to parallelize. - const double kMinParallelCost = 1e6; - return total_cost < kMinParallelCost ? 1 : max_threads; } }; -- cgit v1.2.3 From 1372156c41a103757b68ed572b92654ed01f62e3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 16:16:42 -0700 Subject: Prepared the migration to the new non blocking thread pool --- unsupported/Eigen/CXX11/Tensor | 6 +- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 142 +-------------------- 2 files changed, 4 insertions(+), 144 deletions(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 16132398d..65f5c87e9 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -51,11 +51,7 @@ typedef unsigned __int64 uint64_t; #endif #ifdef EIGEN_USE_THREADS -#include -#include -#include -#include -#include +#include "ThreadPool" #endif #ifdef EIGEN_USE_GPU diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index cd3dd214b..6da16985f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -12,145 +12,9 @@ namespace Eigen { -// This defines an interface that ThreadPoolDevice can take to use -// custom thread pools underneath. -class ThreadPoolInterface { - public: - virtual void Schedule(std::function fn) = 0; - - virtual ~ThreadPoolInterface() {} -}; - -// The implementation of the ThreadPool type ensures that the Schedule method -// runs the functions it is provided in FIFO order when the scheduling is done -// by a single thread. 
-// Environment provides a way to create threads and also allows to intercept -// task submission and execution. -template -class ThreadPoolTempl : public ThreadPoolInterface { - public: - // Construct a pool that contains "num_threads" threads. - explicit ThreadPoolTempl(int num_threads, Environment env = Environment()) - : env_(env), threads_(num_threads), waiters_(num_threads) { - for (int i = 0; i < num_threads; i++) { - threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); - } - } - - // Wait until all scheduled work has finished and then destroy the - // set of threads. - ~ThreadPoolTempl() { - { - // Wait for all work to get done. - std::unique_lock l(mu_); - while (!pending_.empty()) { - empty_.wait(l); - } - exiting_ = true; - - // Wakeup all waiters. - for (auto w : waiters_) { - w->ready = true; - w->task.f = nullptr; - w->cv.notify_one(); - } - } - - // Wait for threads to finish. - for (auto t : threads_) { - delete t; - } - } - - // Schedule fn() for execution in the pool of threads. The functions are - // executed in the order in which they are scheduled. - void Schedule(std::function fn) { - Task t = env_.CreateTask(std::move(fn)); - std::unique_lock l(mu_); - if (waiters_.empty()) { - pending_.push_back(std::move(t)); - } else { - Waiter* w = waiters_.back(); - waiters_.pop_back(); - w->ready = true; - w->task = std::move(t); - w->cv.notify_one(); - } - } - - protected: - void WorkerLoop() { - std::unique_lock l(mu_); - Waiter w; - Task t; - while (!exiting_) { - if (pending_.empty()) { - // Wait for work to be assigned to me - w.ready = false; - waiters_.push_back(&w); - while (!w.ready) { - w.cv.wait(l); - } - t = w.task; - w.task.f = nullptr; - } else { - // Pick up pending work - t = std::move(pending_.front()); - pending_.pop_front(); - if (pending_.empty()) { - empty_.notify_all(); - } - } - if (t.f) { - mu_.unlock(); - env_.ExecuteTask(t); - t.f = nullptr; - mu_.lock(); - } - } - } - - private: - typedef typename Environment::Task Task; - typedef typename Environment::EnvThread Thread; - - struct Waiter { - std::condition_variable cv; - Task task; - bool ready; - }; - - Environment env_; - std::mutex mu_; - MaxSizeVector threads_; // All threads - MaxSizeVector waiters_; // Stack of waiting threads. - std::deque pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() - bool exiting_ = false; -}; - -struct StlThreadEnvironment { - struct Task { - std::function f; - }; - - // EnvThread constructor must start the thread, - // destructor must join the thread. - class EnvThread { - public: - EnvThread(std::function f) : thr_(f) {} - ~EnvThread() { thr_.join(); } - - private: - std::thread thr_; - }; - - EnvThread* CreateThread(std::function f) { return new EnvThread(f); } - Task CreateTask(std::function f) { return Task{std::move(f)}; } - void ExecuteTask(const Task& t) { t.f(); } -}; - -typedef ThreadPoolTempl ThreadPool; +// Use the SimpleThreadPool by default. We'll switch to the new non blocking +// thread pool later. 
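// A minimal usage sketch (illustrative only; the ThreadPoolDevice constructor
// signature below is assumed, it is not part of this patch):
//   Eigen::ThreadPool pool(4);                    // currently an alias for SimpleThreadPool
//   pool.Schedule([]() { /* enqueued closure */ });
//   Eigen::ThreadPoolDevice device(&pool, 4 /* cores */);
//   result.device(device) = lhs + rhs;            // expression evaluated on the pool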
+typedef SimpleThreadPool ThreadPool; // Barrier is an object that allows one or more threads to wait until -- cgit v1.2.3 From 9c064b5a97f9504b573991963f3ed1fbbcf33eac Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 16:41:31 -0700 Subject: Cleanup --- unsupported/Eigen/CXX11/ThreadPool | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index 85028603c..6f360ae2f 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -10,7 +10,6 @@ #ifndef EIGEN_CXX11_THREADPOOL_MODULE #define EIGEN_CXX11_THREADPOOL_MODULE -//#include #include "Core" #include @@ -28,17 +27,9 @@ * \endcode */ -//#include - -//#include "src/Core/util/EmulateArray.h" -//#include "src/Core/util/MaxSizeVector.h" -//#include "third_party/eigen3/Eigen/src/Core/util/Macros.h" - - -// Emulate the cxx11 functionality that we need if the compiler doesn't support it. -// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it -// supports enough of the standard for our needs +// The code depends on CXX11, so only include the module if the +// compiler supports it. #if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 #include #include -- cgit v1.2.3 From bebb89acfaffe68a48585e4724de20dfab3a92d6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 16:44:10 -0700 Subject: Enabled the new threadpool tests --- test/main.h | 4 ++++ unsupported/test/CMakeLists.txt | 2 ++ unsupported/test/cxx11_eventcount.cpp | 1 + unsupported/test/cxx11_runqueue.cpp | 8 ++++---- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/test/main.h b/test/main.h index dbb496b89..b0e3b7818 100644 --- a/test/main.h +++ b/test/main.h @@ -275,6 +275,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a)) +#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b)) +#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b)) + + #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) #define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(verifyIsApprox(a, b)) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index c088df1c1..b9e1b34bf 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -116,6 +116,8 @@ if(EIGEN_TEST_CXX11) set(CMAKE_CXX_STANDARD 11) ei_add_test(cxx11_float16) + ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}") + ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_meta) ei_add_test(cxx11_tensor_simple) # ei_add_test(cxx11_tensor_symmetry) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index a58c21a30..271e0f66d 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -8,6 +8,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+#define EIGEN_USE_THREADS #include "main.h" #include diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index 5a0a13103..f1217a01b 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -8,11 +8,11 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define EIGEN_USE_THREADS #include "main.h" - #include -static void test_basic_runqueue() +void test_basic_runqueue() { RunQueue q; // Check empty state. @@ -96,7 +96,7 @@ static void test_basic_runqueue() // 1 element (either front or back at random). So queue always contains at least // 1 element, but otherwise changes chaotically. Another thread constantly tests // that the queue is not claimed to be empty. -static void test_empty_runqueue() +void test_empty_runqueue() { RunQueue q; q.PushFront(1); @@ -136,7 +136,7 @@ static void test_empty_runqueue() // Stress is a chaotic random test. // One thread (owner) calls PushFront/PopFront, other threads call PushBack/ // PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass. -static void test_stress_runqueue() +void test_stress_runqueue() { const int kEvents = 1 << 18; RunQueue q; -- cgit v1.2.3 From 9624a1ea3d73292bdb5bef49f131089be341523f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 17:16:58 -0700 Subject: Added missing definition of PacketSize in the gpu evaluator of convolution --- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 680fc0a68..ff3c5662d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -790,6 +790,7 @@ struct TensorEvaluator::type PacketReturnType; typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits::size; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } -- cgit v1.2.3 From 07ac4f7e027cddd3457a34295420480f7e541ac5 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Apr 2016 18:28:23 -0700 Subject: Eigen Tensor cost model part 2: Thread scheduling for standard evaluators and reductions. The cost model is turned off by default. 
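For orientation, the scheduling decision wired in below can be summarized by this
stand-alone sketch. It mirrors TensorCostModel::numThreads and the block partitioning
added to TensorExecutor in this patch; the constants come from the previous commit,
while the helper names and the example figures are purely illustrative.

// Stand-alone sketch (not Eigen code) of the thread-count and block-size logic.
#include <algorithm>
#include <cmath>
#include <cstdio>

int numThreadsFor(double total_cost_cycles, int max_threads) {
  const double kStartupCycles = 100000;    // fixed cost of going parallel at all
  const double kPerThreadCycles = 100000;  // work each extra thread must amortize
  int threads = static_cast<int>(
      (total_cost_cycles - kStartupCycles) / kPerThreadCycles + 0.9);
  return std::min(max_threads, std::max(1, threads));
}

long blockSizeFor(long size, int num_threads, int packet_size) {
  // Round the per-thread share up to a multiple of the packet size so that
  // every block except possibly the last is evaluated with full packets.
  long blocksz = static_cast<long>(
      std::ceil(static_cast<double>(size) / num_threads)) + packet_size - 1;
  return std::max<long>(packet_size, blocksz - (blocksz % packet_size));
}

int main() {
  // One million coefficients at roughly one cycle each, on a 4-core machine:
  std::printf("threads = %d\n", numThreadsFor(1e6, 4));          // threads = 4
  std::printf("block   = %ld\n", blockSizeFor(1000000, 4, 8));   // block   = 250000
  return 0;
}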
--- .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 5 + .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 93 ++++++++++++------ .../Eigen/CXX11/src/Tensor/TensorReduction.h | 104 ++++++++------------- 4 files changed, 111 insertions(+), 97 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 32bc5d0b2..4e8f86674 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -10,9 +10,9 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -#if !defined(EIGEN_USE_GPU) -#define EIGEN_USE_COST_MODEL -#endif +//#if !defined(EIGEN_USE_GPU) +//#define EIGEN_USE_COST_MODEL +//#endif namespace Eigen { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f1f9a90df..293012646 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -189,6 +189,11 @@ struct TensorEvaluator return loadConstant(m_data+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index eabfd91fe..df9cc0998 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -59,9 +59,16 @@ class TensorExecutor { const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits::PacketReturnType>::size; + // Manually unroll this loop since compilers don't do it. 
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } const Index VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } for (Index i = VectorizedSize; i < size; ++i) { @@ -78,8 +85,9 @@ class TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); + static void run(void* evaluator_in, const Index first, const Index last) { + Evaluator evaluator(*static_cast(evaluator_in)); + eigen_assert(last >= first); for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); } @@ -88,28 +96,45 @@ struct EvalRange { template struct EvalRange { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); + static void run(void* evaluator_in, const Index first, const Index last) { + Evaluator evaluator(*static_cast(evaluator_in)); + eigen_assert(last >= first); Index i = first; - static const int PacketSize = unpacket_traits::size; + const int PacketSize = unpacket_traits::size; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { + Index last_chunk_offset = last - 4 * PacketSize; + // Manually unroll this loop since compilers don't do it. + for (; i <= last_chunk_offset; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } + last_chunk_offset = last - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < last; ++i) { evaluator.evalScalar(i); } } }; -template -class TensorExecutor -{ +// Used to make an std::function to add to the ThreadPool with less templating +// than EvalRange::Run. +// This requires that this and EvalRange takes a void* to the evaluator that can +// be downcast to the right type by the EvalRange. +template +inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index), + void* evaluator, const Index first, const Index last) { + run_fn(evaluator, first, last); +} + +template +class TensorExecutor { public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) @@ -119,24 +144,35 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { + const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; const Index size = array_prod(evaluator.dimensions()); - - static const int PacketSize = Vectorizable ? 
unpacket_traits::size : 1; - - int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const unsigned int numblocks = static_cast(size / blocksize); - - Barrier barrier(numblocks); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize); + int num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + if (num_threads > 1) { + num_threads = TensorCostModel::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); } - - if (static_cast(numblocks) * blocksize < size) { - EvalRange::run(evaluator, numblocks * blocksize, size); +#endif + if (num_threads == 1) { + EvalRange::run(&evaluator, 0, size); + } else { + Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &InvokeEvalRange, + &EvalRange::run, + static_cast(&evaluator), i * blocksize, + (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange::run(&evaluator, numblocks * blocksize, size); + } + barrier.Wait(); } - - barrier.Wait(); } evaluator.cleanup(); } @@ -226,7 +262,6 @@ inline void TensorExecutor::run( #endif // __CUDACC__ #endif // EIGEN_USE_GPU - } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 1c9e7ab66..885295f0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -214,7 +214,7 @@ struct FullReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); } }; @@ -222,18 +222,19 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template + bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, typename Self::CoeffReturnType* output) { - *output = InnerMostDimReducer::reduce( + *output = InnerMostDimReducer::reduce( self, firstIndex, numValuesToReduce, reducer); } }; -template -struct FullReducer { +// Multithreaded full reducer +template +struct FullReducer { static const bool HasOptimizedImplementation = !Op::IsStateful; static const int PacketSize = unpacket_traits::size; @@ -247,79 +248,44 @@ struct FullReducer { *output = reducer.finalize(reducer.initialize()); return; } - const std::size_t num_threads = device.numThreads(); - if (num_threads == 1) { - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - return; - } else { - const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? 
static_cast(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast(numblocks) * blocksize); - - Barrier barrier(numblocks); - MaxSizeVector shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, - i * blocksize, blocksize, reducer, &shards[i]); - } - - typename Self::CoeffReturnType finalShard; - if (static_cast(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); - } else { - finalShard = reducer.initialize(); - } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i], &finalShard); - } - *output = reducer.finalize(finalShard); - } - } -}; - -template -struct FullReducer { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = - unpacket_traits::size; - - // launch one reducer per thread and accumulate the result. - static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, - typename Self::CoeffReturnType* output) { - typedef typename Self::Index Index; - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - if (num_coeffs == 0) { - *output = reducer.finalize(reducer.initialize()); - return; - } - const std::size_t num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + const TensorOpCost cost = + self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, + PacketSize); + const int num_threads = TensorCostModel::numThreads( + num_coeffs, cost, device.numThreads()); +#else + const int num_threads = device.numThreads(); +#endif if (num_threads == 1) { - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + *output = + InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); return; } - const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast(numblocks) * blocksize); + const Index blocksize = + std::floor(static_cast(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); Barrier barrier(numblocks); MaxSizeVector shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard::run, + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, i * blocksize, blocksize, reducer, &shards[i]); } typename Self::CoeffReturnType finalShard; - if (static_cast(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, + reducer); } else { finalShard = reducer.initialize(); } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { + + for (Index i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } *output = reducer.finalize(finalShard); @@ -498,13 +464,21 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + static bool size_large_enough(Index total_size) { +#ifndef EIGEN_USE_COST_MODEL + return total_size > 1024 * 1024; +#else + return true || total_size; +#endif + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. if (RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) { + (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) { bool need_assign = false; if (!data) { -- cgit v1.2.3 From 18e6f67426139e1bba34c49ed0935b7cc1e1f379 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 20:37:46 -0700 Subject: Added support for exclusive or --- Eigen/src/Core/functors/BinaryFunctors.h | 16 ++++++++++++++++ Eigen/src/plugins/ArrayCwiseBinaryOps.h | 18 ++++++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 ++++++ 3 files changed, 40 insertions(+) diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index e28fecfd0..5cd8ca950 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -344,6 +344,22 @@ template<> struct functor_traits { }; }; +/** \internal + * \brief Template functor to compute the xor of two booleans + * + * \sa class CwiseBinaryOp, ArrayBase::operator^ + */ +struct scalar_boolean_xor_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } +}; +template<> struct functor_traits { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; + /** \internal * \brief Template functor to compute the incomplete gamma function igamma(a, x) * diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 9422c40bc..5694592d6 100644 --- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -280,3 +280,21 @@ operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const return CwiseBinaryOp(derived(),other.derived()); } +/** \returns an expression of the coefficient-wise ^ operator of 
*this and \a other + * + * \warning this operator is for expression of bool only. + * + * Example: \include Cwise_boolean_xor.cpp + * Output: \verbinclude Cwise_boolean_xor.out + * + * \sa operator&&(), select() + */ +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp +operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + EIGEN_STATIC_ASSERT((internal::is_same::value && internal::is_same::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return CwiseBinaryOp(derived(),other.derived()); +} diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 69d1802d5..1a34f3ccc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -334,6 +334,12 @@ class TensorBase return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_xor_op()); + } + // Comparisons and tests. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> -- cgit v1.2.3 From a62e9246562970f384f316c66d17c6ed4bd2a55a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Apr 2016 21:25:06 -0700 Subject: Added ability to access the cache sizes from the tensor devices --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 10 ++++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 20 ++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 9 +++++++++ 3 files changed, 39 insertions(+) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 54e118395..267ac1de9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -2224,6 +2224,16 @@ inline std::ptrdiff_t l2CacheSize() return l2; } +/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\ +rs. +* \sa setCpuCacheSize */ +inline std::ptrdiff_t l3CacheSize() +{ + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); + return l3; +} + /** Set the cpu L1 and L2 cache sizes (in bytes). * These values are use to adjust the size of the blocks * for the algorithms working per blocks. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 267f6f8e3..9d141395b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -44,6 +44,26 @@ struct DefaultDevice { #endif } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return l1CacheSize(); +#else + // Running on a CUDA device, return the amount of shared memory available. 
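    // (48 KB is the typical per-block shared-memory limit on the CUDA devices
    // of this generation, hence the hard-coded fallback below.)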
+ return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + return l3CacheSize(); +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { #ifndef __CUDA_ARCH__ // Running single threaded on the host CPU diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 6da16985f..41918eb19 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -128,6 +128,15 @@ struct ThreadPoolDevice { return num_threads_; } + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { // Should return an enum that encodes the ISA supported by the CPU return 1; -- cgit v1.2.3 From 1e80bddde3756ac7cd36a0db5e7d2493a7b93066 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Apr 2016 17:58:36 +0200 Subject: Fix trmv for mixing types. --- Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/TriangularMatrixVector.h | 4 ++-- test/mixingtypes.cpp | 8 ++++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 831089dee..80ba89465 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, - const ResScalar& alpha, level3_blocking& blocking) + const ResScalar& alpha, level3_blocking& blocking) { general_matrix_matrix_triangular_product EIGEN_DONT_INLINE void triangular_matrix_vector_product ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index size = (std::min)(_rows,_cols); diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index a3b469af8..0b381ec6c 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -148,10 +148,14 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast().eval()*mcd); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast().eval()*mcd); - VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView(), sd*vcd.adjoint()*md.template cast().eval().template triangularView()); + VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView(), sd*vcd.adjoint()*md.template cast().eval().template triangularView()); VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView(), scd*vcd.adjoint()*md.template cast().eval().template triangularView()); - VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template 
triangularView(), sd*vd.adjoint().template cast().eval()*mcd.template triangularView()); + VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView(), sd*vcd.adjoint()*md.transpose().template cast().eval().template triangularView()); + VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView(), scd*vcd.adjoint()*md.transpose().template cast().eval().template triangularView()); + VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView(), sd*vd.adjoint().template cast().eval()*mcd.template triangularView()); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView(), scd*vd.adjoint().template cast().eval()*mcd.template triangularView()); + VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView(), sd*vd.adjoint().template cast().eval()*mcd.transpose().template triangularView()); + VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView(), scd*vd.adjoint().template cast().eval()*mcd.transpose().template triangularView()); // Not supported yet: trmm // VERIFY_IS_APPROX(sd*mcd*md.template triangularView(), sd*mcd*md.template cast().eval().template triangularView()); -- cgit v1.2.3 From 1d2343062805edb86113e2aef5ebcbe5030a57a5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 10:53:31 -0700 Subject: Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs). --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 38 ++++++++++------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 267ac1de9..3d35c8d46 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -11,8 +11,8 @@ #define EIGEN_GENERAL_BLOCK_PANEL_H -namespace Eigen { - +namespace Eigen { + namespace internal { template @@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024; #endif /** \internal */ -struct CacheSizes { +struct CacheSizes { CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { int l1CacheSize, l2CacheSize, l3CacheSize; queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); @@ -107,13 +107,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n enum { kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), ksub = Traits::mr * Traits::nr * sizeof(ResScalar), - k_mask = -8, - + kr = 8, mr = Traits::mr, - mr_mask = -mr, - nr = Traits::nr, - nr_mask = -nr }; // Increasing k gives us more time to prefetch the content of the "C" // registers. However once the latency is hidden there is no point in @@ -121,7 +117,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // experimentally). const Index k_cache = (std::min)((l1-ksub)/kdiv, 320); if (k_cache < k) { - k = k_cache & k_mask; + k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); } @@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n if (n_cache <= n_per_thread) { // Don't exceed the capacity of the l2 cache. 
eigen_internal_assert(n_cache >= static_cast(nr)); - n = n_cache & nr_mask; + n = n_cache - (n_cache % nr); eigen_internal_assert(n > 0); } else { - n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + n = (std::min)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); } if (l3 > l2) { @@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); const Index m_per_thread = numext::div_ceil(m, num_threads); if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { - m = m_cache & mr_mask; + m = m_cache - (m_cache % mr); eigen_internal_assert(m > 0); } else { - m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + m = (std::min)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); } } } @@ -156,23 +152,23 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n l2 = 32*1024; l3 = 512*1024; #endif - + // Early return for small problems because the computation below are time consuming for small problems. // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway // because we use the coefficient-based implementation for them. if((std::max)(k,(std::max)(m,n))<48) return; - + typedef typename Traits::ResScalar ResScalar; enum { k_peeling = 8, k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) }; - + // ---- 1st level of blocking on L1, yields kc ---- - + // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache. // We also include a register-level block of the result (mx x nr). @@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // while keeping the same number of sweeps over the result. k = (k%max_kc)==0 ? max_kc : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); - + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); } - + // ---- 2nd level of blocking on max(L2,L3), yields nc ---- - + // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is: // actual_l2 = max(l2, l3/nb_core_sharing_l3) // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) @@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n #else const Index actual_l2 = 1572864; // == 1.5 MB #endif - + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. 
// When k Date: Fri, 15 Apr 2016 11:27:52 -0700 Subject: Fixed compilation errors with msvc --- unsupported/test/cxx11_eventcount.cpp | 4 ++-- unsupported/test/cxx11_runqueue.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 271e0f66d..ebd019e2d 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -67,8 +67,8 @@ const int TestQueue::kQueueSize; static void test_stress_eventcount() { const int kThreads = std::thread::hardware_concurrency(); - const int kEvents = 1 << 16; - const int kQueues = 10; + static const int kEvents = 1 << 16; + static const int kQueues = 10; std::vector waiters(kThreads); EventCount ec(waiters); diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index f1217a01b..b1e2dbd6d 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -9,6 +9,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define EIGEN_USE_THREADS +#include #include "main.h" #include @@ -138,7 +139,7 @@ void test_empty_runqueue() // PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass. void test_stress_runqueue() { - const int kEvents = 1 << 18; + static const int kEvents = 1 << 18; RunQueue q; std::atomic total(0); std::vector> threads; -- cgit v1.2.3 From 3718bf654bd173ae05f396f5d0cff1a4e15ef72d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 15 Apr 2016 12:51:33 -0700 Subject: Get rid of void* casting when calling EvalRange::run. --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 28 +++++++--------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index df9cc0998..7a54f7a23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -85,8 +85,8 @@ class TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(void* evaluator_in, const Index first, const Index last) { - Evaluator evaluator(*static_cast(evaluator_in)); + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); @@ -96,10 +96,9 @@ struct EvalRange { template struct EvalRange { - static void run(void* evaluator_in, const Index first, const Index last) { - Evaluator evaluator(*static_cast(evaluator_in)); + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - Index i = first; const int PacketSize = unpacket_traits::size; if (last - first >= PacketSize) { @@ -123,16 +122,6 @@ struct EvalRange { } }; -// Used to make an std::function to add to the ThreadPool with less templating -// than EvalRange::Run. -// This requires that this and EvalRange takes a void* to the evaluator that can -// be downcast to the right type by the EvalRange. 
-template -inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index), - void* evaluator, const Index first, const Index last) { - run_fn(evaluator, first, last); -} - template class TensorExecutor { public: @@ -163,13 +152,12 @@ class TensorExecutor { Barrier barrier(numblocks); for (int i = 0; i < numblocks; ++i) { device.enqueue_with_barrier( - &barrier, &InvokeEvalRange, - &EvalRange::run, - static_cast(&evaluator), i * blocksize, - (i + 1) * blocksize); + &barrier, &EvalRange::run, + &evaluator, i * blocksize, (i + 1) * blocksize); } if (numblocks * blocksize < size) { - EvalRange::run(&evaluator, numblocks * blocksize, size); + EvalRange::run( + &evaluator, numblocks * blocksize, size); } barrier.Wait(); } -- cgit v1.2.3 From 2a7115daca3e36e1486c4e22f05ad11f8a055030 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 15 Apr 2016 22:34:11 +0200 Subject: bug #1203: by-pass large stack-allocation in stableNorm if EIGEN_STACK_ALLOCATION_LIMIT is too small --- Eigen/src/Core/StableNorm.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index 7fe39808b..d2fe1e199 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -168,11 +168,12 @@ MatrixBase::stableNorm() const DerivedCopy copy(derived()); enum { - CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator::Alignment)>0) // FIXME + CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*2, internal::evaluator::Alignment>, - typename DerivedCopyClean - ::ConstSegmentReturnType>::type SegmentWrapper; + typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; Index n = size(); if(n==1) -- cgit v1.2.3 From 6c43c49e4aaf833e97a3c759b79d952bfd0a4d9c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 15:34:34 -0700 Subject: Fixed a few compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 - 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1627d4870..2671f4357 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -440,7 +440,7 @@ struct TensorContractionEvaluatorBase return m_result[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 9e5791bd3..8ff7d5815 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -146,7 +146,7 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { + costPerCoeff(bool) const { // TODO(rmlarsen): This is just a placeholder. Define interface to make // generators return their cost. 
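// (Note on the warning fix in this hunk: dropping the parameter name, i.e.
// writing costPerCoeff(bool) instead of costPerCoeff(bool vectorized), is the
// usual way to silence -Wunused-parameter without changing the signature; an
// equivalent sketch would be costPerCoeff(bool /*vectorized*/).)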
return TensorOpCost(0, 0, TensorOpCost::AddCost() + diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 0b769f278..bafcc67bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -448,7 +448,6 @@ struct TensorEvaluator, Device> protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int PacketSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index f8db43dbe..e735fc76f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -525,7 +525,6 @@ struct TensorEvaluator, D protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int PacketSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); -- cgit v1.2.3 From 7cff898e0a2a576a17168f36ba3576b764bbf326 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 15:46:14 -0700 Subject: Deleted unnecessary variable --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 293012646..ae4ce3c90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -459,7 +459,6 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - const int PacketSize = internal::unpacket_traits::size; internal::Selector select; for (Index i = 0; i < PacketSize; ++i) { select.select[i] = m_condImpl.coeff(index+i); -- cgit v1.2.3 From 1a16fb1532552c87fb1cb67ef40679b7a4744613 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 15:50:13 -0700 Subject: Deleted extraneous comma. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 3d35c8d46..bd559dc6a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -109,7 +109,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n ksub = Traits::mr * Traits::nr * sizeof(ResScalar), kr = 8, mr = Traits::mr, - nr = Traits::nr, + nr = Traits::nr }; // Increasing k gives us more time to prefetch the content of the "C" // registers. However once the latency is hidden there is no point in -- cgit v1.2.3 From c8e8f93d6ce9ed02a40ea08b5ccc1ef6e82de0f2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 15 Apr 2016 16:48:10 -0700 Subject: Move the evalGemm method into the TensorContractionEvaluatorBase class to make it accessible from both the single and multithreaded contraction evaluators. 
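Hoisting the GEBP path into the shared base is the usual way to let both the default single-threaded evaluator and the ThreadPool specialisation reuse one body: each derived evaluator keeps only its dispatch logic and calls the inherited member. A simplified sketch of the resulting shape (class names shortened for illustration, template parameters and the real Scalar type omitted):

    struct EvaluatorBase {
      template <int Alignment>
      void evalGemm(float* /*buffer*/) const { /* packs lhs/rhs panels and calls the gebp kernel */ }
    };
    struct ThreadPoolEvaluator : EvaluatorBase {
      template <int Alignment>
      void evalProduct(float* buffer) const {
        this->template evalGemm<Alignment>(buffer);  // shared fallback, as in the real evaluators
      }
    };
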
--- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 188 ++++++++++----------- 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 2671f4357..97182258d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -426,6 +426,99 @@ struct TensorContractionEvaluatorBase buffer, resIncr, alpha); } + template + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + + internal::gebp_kernel gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. 
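// (Background for the block sizes chosen just below: in the Goto/van de Geijn
// scheme an mc x kc panel of the lhs and a kc x nc panel of the rhs are packed so
// that they stay cache-resident while the mr x nr register kernel sweeps over
// them. Here kc/mc/nc come from TensorContractionBlocking, and mc and nc are
// additionally clamped with numext::mini so they never exceed m and n.)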
+ internal::TensorContractionBlocking blocking(k, m, n, 1); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -533,100 +626,7 @@ struct TensorEvaluator(buffer); - } - - template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - const Index lhs_packet_size = internal::unpacket_traits::size; - const Index rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; - internal::gemm_pack_rhs pack_rhs; - - internal::gebp_kernel gebp; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // Sizes of the blocks to load in cache. See the Goto paper for details. 
- internal::TensorContractionBlocking blocking(k, m, n, 1); - const Index kc = blocking.kc(); - const Index mc = numext::mini(m, blocking.mc()); - const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); - - for(Index i2=0; i2m_device.deallocate(blockA); - this->m_device.deallocate(blockB); + this->template evalGemm(buffer); } }; -- cgit v1.2.3 From e4fe611e2c9cf7b2853016c39898a224b3ec51a3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 16 Apr 2016 15:17:39 +0200 Subject: Enable lazy-coeff-based-product for vector*(1x1) products --- Eigen/src/Core/GeneralProduct.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 53f934999..f7c5f4276 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -81,6 +81,8 @@ public: * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ // FIXME I'm not sure the current mapping is the ideal one. template struct product_type_selector { enum { ret = OuterProduct }; }; +template struct product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; }; template struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; -- cgit v1.2.3 From 5fbcfe5eb45a946230c06b7a7cac2e6ec6088457 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 17 Apr 2016 18:42:31 -0700 Subject: Worked around the lack of a rand_r function on windows systems --- unsupported/test/cxx11_runqueue.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index b1e2dbd6d..f8552b76d 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -13,6 +13,15 @@ #include "main.h" #include + +#ifdef EIGEN_COMP_MSVC_STRICT +// Visual studio doesn't implementan rand_r() function since its +// implementation of rand()is already thread safe +int rand_r(unsigned int*) { + return rand(); +} +#endif + void test_basic_runqueue() { RunQueue q; -- cgit v1.2.3 From 84543c8be2076b747bbc730a760e7693ab6caae4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 17 Apr 2016 19:29:27 -0700 Subject: Worked around the lack of a rand_r function on windows systems --- unsupported/test/cxx11_eventcount.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index ebd019e2d..59039dae9 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -12,6 +12,14 @@ #include "main.h" #include +#ifdef EIGEN_COMP_MSVC_STRICT +// Visual studio doesn't implementan rand_r() function since its +// implementation of rand()is already thread safe +int rand_r(unsigned int*) { + return rand(); +} +#endif + static void test_basic_eventcount() { std::vector waiters(1); -- cgit v1.2.3 From 50968a0a3ed2686b25f6df1687f4cf7fc6b66da1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 11:53:58 -0700 Subject: Use DenseIndex in the MeanReducer to avoid overflows when 
processing very large tensors. --- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 44dc2d730..33cd00391 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -158,8 +158,8 @@ template struct MeanReducer } protected: - int scalarCount_; - int packetCount_; + DenseIndex scalarCount_; + DenseIndex packetCount_; }; template struct MaxReducer -- cgit v1.2.3 From f953c607058efd7c3508e888ac4bf2c526336b87 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 12:57:39 -0700 Subject: Fixed 2 recent regression tests --- unsupported/test/cxx11_eventcount.cpp | 25 ++++++++++++++----------- unsupported/test/cxx11_runqueue.cpp | 27 +++++++++++++++------------ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 59039dae9..898c4c278 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -12,11 +12,14 @@ #include "main.h" #include +// Visual studio doesn't implement a rand_r() function since its +// implementation of rand() is already thread safe +int rand_reentrant(unsigned int* s) { #ifdef EIGEN_COMP_MSVC_STRICT -// Visual studio doesn't implementan rand_r() function since its -// implementation of rand()is already thread safe -int rand_r(unsigned int*) { return rand(); +#else + return rand_r(s); +endif } #endif @@ -85,15 +88,15 @@ static void test_stress_eventcount() std::vector> producers; for (int i = 0; i < kThreads; i++) { producers.emplace_back(new std::thread([&ec, &queues]() { - unsigned rnd = std::hash()(std::this_thread::get_id()); - for (int i = 0; i < kEvents; i++) { - unsigned idx = rand_r(&rnd) % kQueues; + unsigned int rnd = static_cast(std::hash()(std::this_thread::get_id())); + for (int j = 0; j < kEvents; j++) { + unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Push()) { ec.Notify(false); continue; } std::this_thread::yield(); - i--; + j--; } })); } @@ -102,11 +105,11 @@ static void test_stress_eventcount() for (int i = 0; i < kThreads; i++) { consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() { EventCount::Waiter& w = waiters[i]; - unsigned rnd = std::hash()(std::this_thread::get_id()); - for (int i = 0; i < kEvents; i++) { - unsigned idx = rand_r(&rnd) % kQueues; + unsigned int rnd = static_cast(std::hash()(std::this_thread::get_id())); + for (int j = 0; j < kEvents; k++) { + unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Pop()) continue; - i--; + j--; ec.Prewait(&w); bool empty = true; for (int q = 0; q < kQueues; q++) { diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index f8552b76d..5cfb38b2e 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -14,11 +14,14 @@ #include +// Visual studio doesn't implement a rand_r() function since its +// implementation of rand() is already thread safe +int rand_reentrant(unsigned int* s) { #ifdef EIGEN_COMP_MSVC_STRICT -// Visual studio doesn't implementan rand_r() function since its -// implementation of rand()is already thread safe -int rand_r(unsigned int*) { return rand(); +#else + return rand_r(s); +endif } #endif @@ -115,11 +118,11 @@ void test_empty_runqueue() unsigned rnd = 0; std::vector stolen; for (int i = 
0; i < 1 << 18; i++) { - if (rand_r(&rnd) % 2) + if (rand_reentrant(&rnd) % 2) VERIFY_IS_EQUAL(0, q.PushFront(1)); else VERIFY_IS_EQUAL(0, q.PushBack(1)); - if (rand_r(&rnd) % 2) + if (rand_reentrant(&rnd) % 2) VERIFY_IS_EQUAL(1, q.PopFront()); else { for (;;) { @@ -176,30 +179,30 @@ void test_stress_runqueue() for (int i = 0; i < 2; i++) { threads.emplace_back(new std::thread([&q, &total]() { int sum = 0; - for (int i = 1; i < kEvents; i++) { - if (q.PushBack(i) == 0) { - sum += i; + for (int j = 1; j < kEvents; j++) { + if (q.PushBack(j) == 0) { + sum += j; continue; } std::this_thread::yield(); - i--; + j--; } total += sum; })); threads.emplace_back(new std::thread([&q, &total]() { int sum = 0; std::vector stolen; - for (int i = 1; i < kEvents;) { + for (int j = 1; j < kEvents;) { if (q.PopBackHalf(&stolen) == 0) { std::this_thread::yield(); continue; } - while (stolen.size() && i < kEvents) { + while (stolen.size() && j < kEvents) { int v = stolen.back(); stolen.pop_back(); VERIFY_IS_NOT_EQUAL(v, 0); sum += v; - i++; + j++; } } while (stolen.size()) { -- cgit v1.2.3 From a278414d1b1220d56f574a80027955c55542cd95 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 13:54:28 -0700 Subject: Avoid an unnecessary copy of the evaluator. --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 7a54f7a23..907da9446 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -183,7 +183,7 @@ class TensorExecutor { template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator eval, Index first, Index last, Index step_size) { + void run(Evaluator& eval, Index first, Index last, Index step_size) { for (Index i = first; i < last; i += step_size) { eval.evalScalar(i); } -- cgit v1.2.3 From 884c07505868c0167467c5c3de207724b24f12ab Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 14:33:30 -0700 Subject: Use numext::ceil instead of std::ceil --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index c33d54d6e..02193f263 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -136,7 +136,7 @@ struct FullReducer { const Index num_coeffs = array_prod(self.m_impl.dimensions()); const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + const int num_blocks = numext::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); if (num_blocks > 1) { // We initialize the outputs outside the reduction kernel when we can't be sure that there -- cgit v1.2.3 From b9ea40c30d1d32d0f31b047aa681c384fd1a2c98 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 14:35:11 -0700 Subject: Don't take the address of a kernel on CUDA devices that don't support this feature. 
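The underlying restriction, as far as the CUDA toolchain documents it, is that taking the address of a __global__ kernel from device-side code is only supported from compute capability 3.5 onward, so the launch macro has to distinguish three cases at compile time. A sketch of the guard the next hunk introduces (do_something stands in for a real kernel and is purely illustrative):

    #if !defined(__CUDA_ARCH__)
      // host pass: actually launching the kernel is fine
    #elif __CUDA_ARCH__ >= 350
      // device pass on sm_35+: forming &do_something is legal, launching still is not
    #else
      // device pass on older GPUs: do not even take the kernel's address
    #endif
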
--- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 821835cf3..8e7f5dddb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -291,14 +291,17 @@ struct GpuDevice { int max_blocks_; }; -#ifndef __CUDA_ARCH__ +#if !defined(__CUDA_ARCH__) #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#else +#elif __CUDA_ARCH__ >= 350 #define LAUNCH_CUDA_KERNEL(kernel, ...) \ { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel); +#else +#define LAUNCH_CUDA_KERNEL(kernel, ...) \ + eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel); #endif -- cgit v1.2.3 From 7129d998db0a8dd74125ad7081f3d220cbce96f0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 14:55:21 -0700 Subject: Simplified the code that launches cuda kernels. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 9 --------- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 12 ++++++------ 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 8e7f5dddb..1d2d162dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -291,18 +291,9 @@ struct GpuDevice { int max_blocks_; }; -#if !defined(__CUDA_ARCH__) #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#elif __CUDA_ARCH__ >= 350 -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel); -#else -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__ kernel); -#endif // FIXME: Should be device and kernel specific. 
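// The single remaining definition of LAUNCH_CUDA_KERNEL above is an ordinary
// triple-chevron launch followed by an immediate sanity check; schematically, for
// some kernel f and a GpuDevice d (illustrative names):
//   f<<<grid_size, block_size, shared_bytes, d.stream()>>>(args...);
//   assert(cudaGetLastError() == cudaSuccess);
// Checking cudaGetLastError() right after the launch reports invalid launch
// configurations at the call site rather than at the next synchronisation point.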
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 907da9446..bf6e10a7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -193,7 +193,7 @@ struct EigenMetaKernelEval { template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator eval, Index first, Index last, Index step_size) { + void run(Evaluator& eval, Index first, Index last, Index step_size) { const Index PacketSize = unpacket_traits::size; const Index vectorized_size = (last / PacketSize) * PacketSize; const Index vectorized_step_size = step_size * PacketSize; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 02193f263..d80436326 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,11 +126,11 @@ struct FullReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) { + static void run(const Self&, Op&, const GpuDevice&, OutputType*) { assert(false && "Should only be called on floats"); } - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); @@ -226,12 +226,12 @@ struct InnerReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); return true; } - static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; // It's faster to use the usual code. @@ -305,12 +305,12 @@ struct OuterReducer { internal::is_same::value; template - static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); return true; } - static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; // It's faster to use the usual code. -- cgit v1.2.3 From 5b1106c56b64e4691c2849184d416631c689348a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 14:57:57 -0700 Subject: Fixed a compilation error with nvcc 7. 
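The fix restores EIGEN_DEVICE_FUNC on the generic fallback overloads that the previous commit had stripped; nvcc 7 appears to still instantiate those stubs in device code, so they must carry the device qualifier even though they only assert. The shape of the affected stubs, paraphrased from the hunk below:

    template <typename OutputType>
    static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) {
      assert(false && "Should only be called on floats");  // unreachable for supported types
    }
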
--- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index d80436326..afa5a257a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -126,7 +126,7 @@ struct FullReducer { internal::is_same::value; template - static void run(const Self&, Op&, const GpuDevice&, OutputType*) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) { assert(false && "Should only be called on floats"); } @@ -226,7 +226,7 @@ struct InnerReducer { internal::is_same::value; template - static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); return true; } @@ -305,7 +305,7 @@ struct OuterReducer { internal::is_same::value; template - static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); return true; } -- cgit v1.2.3 From 04f954956d407fbf6cf7c2be3a993460aa608177 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 15:27:09 -0700 Subject: Fixed a few typos --- unsupported/test/cxx11_eventcount.cpp | 6 +++--- unsupported/test/cxx11_runqueue.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 898c4c278..2f250338c 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -16,12 +16,12 @@ // implementation of rand() is already thread safe int rand_reentrant(unsigned int* s) { #ifdef EIGEN_COMP_MSVC_STRICT + EIGEN_UNUSED_VARIABLE(s); return rand(); #else return rand_r(s); -endif -} #endif +} static void test_basic_eventcount() { @@ -106,7 +106,7 @@ static void test_stress_eventcount() consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() { EventCount::Waiter& w = waiters[i]; unsigned int rnd = static_cast(std::hash()(std::this_thread::get_id())); - for (int j = 0; j < kEvents; k++) { + for (int j = 0; j < kEvents; j++) { unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Pop()) continue; j--; diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index 5cfb38b2e..4207824bf 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -18,12 +18,12 @@ // implementation of rand() is already thread safe int rand_reentrant(unsigned int* s) { #ifdef EIGEN_COMP_MSVC_STRICT + EIGEN_UNUSED_VARIABLE(s); return rand(); #else return rand_r(s); -endif -} #endif +} void test_basic_runqueue() { -- cgit v1.2.3 From 2b7216302864715eb6950448fc71b0e4b51d617c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 15:56:02 -0700 Subject: Implemented a more portable version of thread local variables --- unsupported/Eigen/CXX11/ThreadPool | 1 + .../CXX11/src/ThreadPool/NonBlockingThreadPool.h | 2 +- .../Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 22 ++++++++++++++++++++++ 3 
files changed, 24 insertions(+), 1 deletion(-) create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index 6f360ae2f..6cd057b4b 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -47,6 +47,7 @@ #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" +#include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadPoolInterface.h" #include "src/ThreadPool/ThreadEnvironment.h" #include "src/ThreadPool/SimpleThreadPool.h" diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 18dec5393..1c471a19f 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -212,7 +212,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } PerThread* GetPerThread() { - static thread_local PerThread per_thread_; + EIGEN_THREAD_LOCAL PerThread per_thread_; PerThread* pt = &per_thread_; if (pt->inited) return pt; pt->inited = true; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h new file mode 100644 index 000000000..cfa221732 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h @@ -0,0 +1,22 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H +#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H + +// Try to come up with a portable implementation of thread local variables +#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7) +#define EIGEN_THREAD_LOCAL static __thread +#elif EIGEN_COMP_CLANG +#define EIGEN_THREAD_LOCAL static __thread +#else +#define EIGEN_THREAD_LOCAL static thread_local +#endif + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H -- cgit v1.2.3 From 1d0238375d53e168eda40139979f8d128e72f4e8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 17:44:12 -0700 Subject: Made sure all the required header files are included when trying to use fp16 --- Eigen/Core | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index c7192b037..ac58f10c3 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -214,10 +214,14 @@ #include #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 #define EIGEN_HAS_CUDA_FP16 - #include #endif #endif +#if defined EIGEN_HAS_CUDA_FP16 + #include + #include +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif -- cgit v1.2.3 From c7c2054bb519ad01039560254f9dda8698cb0cd9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 19 Apr 2016 17:59:58 -0700 Subject: Started to implement a portable way to yield. 
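As with EIGEN_THREAD_LOCAL above, the yield wrapper exists because old gcc toolchains (4.7 and earlier) do not reliably expose std::this_thread::yield, so those builds fall back to POSIX sched_yield(). A typical use is a portable spin-wait; a minimal sketch, assuming the ThreadPool header (which defines EIGEN_THREAD_YIELD) is included and using a hypothetical std::atomic<bool> flag:

    #include <atomic>
    std::atomic<bool> ready(false);
    void wait_until_ready() {
      while (!ready.load(std::memory_order_acquire)) {
        EIGEN_THREAD_YIELD();  // sched_yield() on old gcc, std::this_thread::yield() otherwise
      }
    }
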
--- unsupported/Eigen/CXX11/ThreadPool | 3 ++- unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h | 4 ++-- unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index 6cd057b4b..fe00a0b65 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -45,9 +45,10 @@ #include #include +#include "src/ThreadPool/ThreadLocal.h" +#include "src/ThreadPool/ThreadYield.h" #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" -#include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadPoolInterface.h" #include "src/ThreadPool/ThreadEnvironment.h" #include "src/ThreadPool/SimpleThreadPool.h" diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h index 16eee1a41..6dd64f185 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -81,7 +81,7 @@ class EventCount { if (int64_t((state & kEpochMask) - epoch) < 0) { // The preceeding waiter has not decided on its fate. Wait until it // calls either CancelWait or CommitWait, or is notified. - std::this_thread::yield(); + EIGEN_THREAD_YIELD(); state = state_.load(std::memory_order_seq_cst); continue; } @@ -112,7 +112,7 @@ class EventCount { if (int64_t((state & kEpochMask) - epoch) < 0) { // The preceeding waiter has not decided on its fate. Wait until it // calls either CancelWait or CommitWait, or is notified. - std::this_thread::yield(); + EIGEN_THREAD_YIELD(); state = state_.load(std::memory_order_relaxed); continue; } diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h new file mode 100644 index 000000000..a859c7ba3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h @@ -0,0 +1,20 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H +#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H + +// Try to come up with a portable way to yield +#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7) +#define EIGEN_THREAD_YIELD() sched_yield() +#else +#define EIGEN_THREAD_YIELD() std::this_thread::yield() +#endif + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H -- cgit v1.2.3 From 80200a182862bf7c19b51645ab8e7b9e15b65a90 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Apr 2016 12:10:27 -0700 Subject: Don't attempt to leverage the _cvtss_sh and _cvtsh_ss instructions when compiling with clang since it's unclear which versions of clang actually support these instruction. 
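The __F16C__ branch is what enables EIGEN_HAS_FP16_C, i.e. the hardware float-to-half scalar conversions; since it is unclear which clang releases actually ship working _cvtss_sh/_cvtsh_ss, the guard now simply skips clang. A standalone sketch of what that code path relies on (x86 host compiled with -mf16c assumed):

    #include <immintrin.h>
    float roundtrip_through_half(float x) {
      unsigned short h = _cvtss_sh(x, 0);  // float -> IEEE binary16, 0 selects round-to-nearest
      return _cvtsh_ss(h);                 // binary16 -> float
    }
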
--- Eigen/Core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index ac58f10c3..50040135f 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -204,7 +204,7 @@ #endif #endif -#if defined(__F16C__) +#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C #endif -- cgit v1.2.3 From a792cd357d31f0a4fce62ed1fa4cc0334cf2f143 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Apr 2016 17:33:58 -0700 Subject: Added more tests --- unsupported/test/cxx11_tensor_fixed_size.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 1c33fefb3..5fe164859 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -20,6 +20,8 @@ static void test_0d() TensorFixedSize > scalar1; TensorFixedSize, RowMajor> scalar2; VERIFY_IS_EQUAL(scalar1.rank(), 0); + VERIFY_IS_EQUAL(scalar1.size(), 1); + VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1); scalar1() = 7.0; scalar2() = 13.0; -- cgit v1.2.3 From 2dde1b102866e1928e925678951463f2a7051af1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Apr 2016 18:08:20 -0700 Subject: Don't crash when attempting to reduce empty tensors. --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 12 ++++++++++-- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 7 ++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index bf6e10a7b..c3edae477 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -238,7 +238,7 @@ inline void TensorExecutor::run( device.maxCudaThreadsPerMultiProcessor() / block_size; const Index size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. 
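// divup (defined in TensorMeta.h, see the hunk further down in this commit) is
// plain ceiling division, and numext::maxi(..., 1) then guarantees at least one
// block even for empty tensors; for example, with block_size == 256:
//   divup(300, 256) == 2   // (300 + 255) / 256
//   divup(0,   256) == 0   // clamped back up to 1 block by numext::maxi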
- const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); + const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); LAUNCH_CUDA_KERNEL( (EigenMetaKernel, Index>), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 6af2d45d4..cd04716bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -24,9 +24,17 @@ const T2& choose(Cond, const T1&, const T2& second) { return second; } -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const X x, const Y y) { + return static_cast((x + y - 1) / y); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T divup(const T x, const T y) { - return (x + y - 1) / y; + return static_cast((x + y - 1) / y); } template struct max_n_1 { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index afa5a257a..fd2587dd5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -134,9 +134,14 @@ struct FullReducer { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return; + } + const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = numext::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + const int num_blocks = divup(num_coeffs, block_size * num_per_thread); if (num_blocks > 1) { // We initialize the outputs outside the reduction kernel when we can't be sure that there -- cgit v1.2.3 From 32ffce04fc3415ef10d2913fc90806077602e87d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Apr 2016 08:47:28 -0700 Subject: Use EIGEN_THREAD_YIELD instead of std::this_thread::yield to make the code more portable. 
--- unsupported/test/cxx11_eventcount.cpp | 2 +- unsupported/test/cxx11_runqueue.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 2f250338c..f16cc6f07 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -95,7 +95,7 @@ static void test_stress_eventcount() ec.Notify(false); continue; } - std::this_thread::yield(); + EIGEN_THREAD_YIELD(); j--; } })); diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index 4207824bf..6c99eb981 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -184,7 +184,7 @@ void test_stress_runqueue() sum += j; continue; } - std::this_thread::yield(); + EIGEN_THREAD_YIELD(); j--; } total += sum; @@ -194,7 +194,7 @@ void test_stress_runqueue() std::vector stolen; for (int j = 1; j < kEvents;) { if (q.PopBackHalf(&stolen) == 0) { - std::this_thread::yield(); + EIGEN_THREAD_YIELD(); continue; } while (stolen.size() && j < kEvents) { @@ -209,7 +209,7 @@ void test_stress_runqueue() int v = stolen.back(); stolen.pop_back(); VERIFY_IS_NOT_EQUAL(v, 0); - while ((v = q.PushBack(v)) != 0) std::this_thread::yield(); + while ((v = q.PushBack(v)) != 0) EIGEN_THREAD_YIELD(); } total -= sum; })); -- cgit v1.2.3 From 6015422ee6e07377e9f8c776d136674ea303c57b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Apr 2016 10:30:29 -0700 Subject: Added an option to enable the use of the F16C instruction set --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51beba118..3b3753332 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,6 +221,12 @@ if(NOT MSVC) message(STATUS "Enabling FMA in tests/examples") endif() + option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF) + if(EIGEN_TEST_F16C) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") + message(STATUS "Enabling F16C in tests/examples") + endif() + option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF) if(EIGEN_TEST_ALTIVEC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec") -- cgit v1.2.3 From f670613e4b90609229b016c3e2d1be9f4b8d54eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Apr 2016 11:03:02 -0700 Subject: Fixed several compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 2 +- unsupported/test/cxx11_runqueue.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index c3edae477..5c3d4d630 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -135,7 +135,7 @@ class TensorExecutor { { const Index PacketSize = Vectorizable ? 
unpacket_traits::size : 1; const Index size = array_prod(evaluator.dimensions()); - int num_threads = device.numThreads(); + size_t num_threads = device.numThreads(); #ifdef EIGEN_USE_COST_MODEL if (num_threads > 1) { num_threads = TensorCostModel::numThreads( diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h index aaa1d92c7..0544a6e15 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -168,7 +168,7 @@ class RunQueue { // larger than it is during concurrent modifications. E.g. pop can // decrement size before the corresponding push has incremented it. // So the computed size can be up to kSize + 1, fix it. - if (size > kSize) size = kSize; + if (size > static_cast(kSize)) size = kSize; return size; } } diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index 6c99eb981..d1770ee1b 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -30,11 +30,11 @@ void test_basic_runqueue() RunQueue q; // Check empty state. VERIFY(q.Empty()); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); VERIFY_IS_EQUAL(0, q.PopFront()); std::vector stolen; VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(0, stolen.size()); + VERIFY_IS_EQUAL(0u, stolen.size()); // Push one front, pop one front. VERIFY_IS_EQUAL(0, q.PushFront(1)); VERIFY_IS_EQUAL(1, q.Size()); -- cgit v1.2.3 From 79b900375fc0bffd659b19f56818156942687b0c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Apr 2016 11:58:27 -0700 Subject: Use index list for the striding benchmarks --- bench/tensors/tensor_benchmarks.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 90b9bc741..62533a608 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -201,9 +201,15 @@ template class BenchmarkSuite { size_b[1] = k_/2; TensorMap, Eigen::Aligned> B(b_, size_b); +#ifndef EIGEN_HAS_INDEX_LIST Eigen::array strides; strides[0] = 1; strides[1] = 2; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList, Eigen::type2index<2> > strides; +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { -- cgit v1.2.3 From 33adce5c3abcf14b1da5d9ba6502530d140f5cb4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Apr 2016 11:59:58 -0700 Subject: Added the ability to switch to the new thread pool with a #define --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 41918eb19..fc03d84a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -14,7 +14,11 @@ namespace Eigen { // Use the SimpleThreadPool by default. We'll switch to the new non blocking // thread pool later. 
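// With the #ifdef added below, opting in to the experimental pool is a matter of
// defining the macro before including the Tensor module; illustrative usage only:
//   #define EIGEN_USE_NONBLOCKING_THREAD_POOL
//   #include <unsupported/Eigen/CXX11/Tensor>
// Nothing changes for existing users: SimpleThreadPool remains the default.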
+#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL +typedef NonBlockingThreadPool ThreadPool; +#else typedef SimpleThreadPool ThreadPool; +#endif // Barrier is an object that allows one or more threads to wait until -- cgit v1.2.3 From a3256d78d849b6978356a587038c3c62d4437bc9 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 21 Apr 2016 16:49:28 -0700 Subject: Prevent crash in CompleteOrthogonalDecomposition if object was default constructed. --- Eigen/src/QR/CompleteOrthogonalDecomposition.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index e71944fd7..230d0d23c 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -397,6 +397,10 @@ CompleteOrthogonalDecomposition& CompleteOrthogonalDecomposition< const Index rank = m_cpqr.rank(); const Index cols = matrix.cols(); + const Index rows = matrix.rows(); + m_zCoeffs.resize((std::min)(rows, cols)); + m_temp.resize(cols); + if (rank < cols) { // We have reduced the (permuted) matrix to the form // [R11 R12] -- cgit v1.2.3 From 4bbc97be5eda5dacd5ac44baa54f59a59176e12c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 21 Apr 2016 17:59:33 -0700 Subject: Provide access to the base threadpool classes --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index fc03d84a7..c02891465 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -15,8 +15,10 @@ namespace Eigen { // Use the SimpleThreadPool by default. We'll switch to the new non blocking // thread pool later. #ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL +template using ThreadPoolTempl = NonBlockingThreadPoolTempl; typedef NonBlockingThreadPool ThreadPool; #else +template using ThreadPoolTempl = SimpleThreadPoolTempl; typedef SimpleThreadPool ThreadPool; #endif -- cgit v1.2.3 From fd9401f2601fe8ed16c1a386f217da02dc8bfd2f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 25 Apr 2016 09:16:08 -0700 Subject: Refined the cost of the striding operation. --- unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 23248c626..52b7d216a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -209,11 +209,17 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + - 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost() + + TensorOpCost::MulCost() + + TensorOpCost::DivCost()) + + TensorOpCost::MulCost(); + if (vectorized) { + compute_cost *= 2; // packet() computes two indices + } + const int innerDim = (static_cast(Layout) == static_cast(ColMajor)) ? 
0 : (NumDims - 1); + return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) + + // Computation is not vectorized per se, but it is done once per packet. + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } -- cgit v1.2.3 From 4a164d2c465e494406ccfa663eb35e2e114e5292 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 25 Apr 2016 10:43:03 -0700 Subject: Fixed the partial evaluation of non vectorizable tensor subexpressions --- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2 +- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 37 ++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 893542d29..5c6748a43 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -95,7 +95,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, - PacketAccess = true, + PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index d2b0b30a1..1ce53ad69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -90,7 +90,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketSize > 1), Layout = TensorEvaluator::Layout, RawAccess = true }; diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index cb917bb37..154a72d5c 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -228,6 +228,42 @@ void test_cuda_reductions() { gpu_device.deallocate(d_res_float); } +void test_cuda_forced_evals() { + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half.device(gpu_device) = gpu_float.cast().abs().eval().cast(); + + Tensor half_prec(num_elem); + Tensor full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking unary " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} #endif @@ -246,6 +282,7 @@ void test_cxx11_tensor_of_float16_cuda() CALL_SUBTEST_1(test_cuda_elementwise()); 
CALL_SUBTEST_2(test_cuda_contractions()); CALL_SUBTEST_3(test_cuda_reductions()); + CALL_SUBTEST_4(test_cuda_forced_evals()); } else { std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl; -- cgit v1.2.3 From 3dddd341334975d4f71f4f7ca15c48aaa920c57d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Apr 2016 11:20:25 +0200 Subject: Refactor the unsupported CXX11/Core module to internal headers only. --- unsupported/Eigen/CXX11/CMakeLists.txt | 2 +- unsupported/Eigen/CXX11/Core | 51 -- unsupported/Eigen/CXX11/Tensor | 4 +- unsupported/Eigen/CXX11/TensorSymmetry | 2 + unsupported/Eigen/CXX11/ThreadPool | 5 +- unsupported/Eigen/CXX11/src/CMakeLists.txt | 3 +- unsupported/Eigen/CXX11/src/Core/CMakeLists.txt | 1 - .../Eigen/CXX11/src/Core/util/CMakeLists.txt | 6 - unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 526 -------------------- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 88 ---- .../Eigen/CXX11/src/Core/util/EmulateArray.h | 271 ----------- .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 311 ------------ .../Eigen/CXX11/src/Core/util/MaxSizeVector.h | 130 ----- .../Eigen/CXX11/src/ThreadPool/CMakeLists.txt | 6 + unsupported/Eigen/CXX11/src/util/CMakeLists.txt | 6 + unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 542 +++++++++++++++++++++ .../Eigen/CXX11/src/util/CXX11Workarounds.h | 88 ++++ unsupported/Eigen/CXX11/src/util/EmulateArray.h | 267 ++++++++++ .../Eigen/CXX11/src/util/EmulateCXX11Meta.h | 311 ++++++++++++ unsupported/Eigen/CXX11/src/util/MaxSizeVector.h | 130 +++++ unsupported/test/cxx11_meta.cpp | 2 +- 21 files changed, 1361 insertions(+), 1391 deletions(-) delete mode 100644 unsupported/Eigen/CXX11/Core delete mode 100644 unsupported/Eigen/CXX11/src/Core/CMakeLists.txt delete mode 100644 unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt delete mode 100644 unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h delete mode 100644 unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h delete mode 100644 unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h delete mode 100644 unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h delete mode 100644 unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h create mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/util/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/util/CXX11Meta.h create mode 100644 unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h create mode 100644 unsupported/Eigen/CXX11/src/util/EmulateArray.h create mode 100644 unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h create mode 100644 unsupported/Eigen/CXX11/src/util/MaxSizeVector.h diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt index f1d9f0482..a40bc4715 100644 --- a/unsupported/Eigen/CXX11/CMakeLists.txt +++ b/unsupported/Eigen/CXX11/CMakeLists.txt @@ -1,4 +1,4 @@ -set(Eigen_CXX11_HEADERS Core Tensor TensorSymmetry) +set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool) install(FILES ${Eigen_CXX11_HEADERS} diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core deleted file mode 100644 index 946145f5a..000000000 --- a/unsupported/Eigen/CXX11/Core +++ /dev/null @@ -1,51 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2013 Christian Seiler -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_CORE_MODULE -#define EIGEN_CXX11_CORE_MODULE - -#include - -#include - -/** \defgroup CXX11_Core_Module C++11 Core Module - * - * This module provides common core features for all modules that - * explicitly depend on C++11. Currently, this is only the Tensor - * module. Note that at this stage, you should not need to include - * this module directly. - * - * It also provides a limited fallback for compilers that don't support - * CXX11 yet, such as nvcc. - * - * \code - * #include - * \endcode - */ - -#include - -#include "src/Core/util/EmulateArray.h" -#include "src/Core/util/MaxSizeVector.h" - -// Emulate the cxx11 functionality that we need if the compiler doesn't support it. -// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it -// supports enough of the standard for our needs -#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 -#include "src/Core/util/CXX11Workarounds.h" -#include "src/Core/util/CXX11Meta.h" -#else -#include "src/Core/util/EmulateCXX11Meta.h" -#endif - -#include - -#endif // EIGEN_CXX11_CORE_MODULE - diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index c4bcd2981..5abd5de1a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -11,10 +11,10 @@ //#ifndef EIGEN_CXX11_TENSOR_MODULE //#define EIGEN_CXX11_TENSOR_MODULE -#include "Core" - #include +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" /** \defgroup CXX11_Tensor_Module Tensor Module * diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry index f1dc25fea..fb1b0c0fb 100644 --- a/unsupported/Eigen/CXX11/TensorSymmetry +++ b/unsupported/Eigen/CXX11/TensorSymmetry @@ -14,6 +14,8 @@ #include +#include "src/util/CXX11Meta.h" + /** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module * * This module provides a classes that allow for the definition of diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index fe00a0b65..15e5634ed 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -10,8 +10,6 @@ #ifndef EIGEN_CXX11_THREADPOOL_MODULE #define EIGEN_CXX11_THREADPOOL_MODULE -#include "Core" - #include /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module @@ -45,6 +43,9 @@ #include #include +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" + #include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadYield.h" #include "src/ThreadPool/EventCount.h" diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt index d90ee1b0f..1734262bb 100644 --- a/unsupported/Eigen/CXX11/src/CMakeLists.txt +++ b/unsupported/Eigen/CXX11/src/CMakeLists.txt @@ -1,3 +1,4 @@ -add_subdirectory(Core) +add_subdirectory(util) +add_subdirectory(ThreadPool) add_subdirectory(Tensor) add_subdirectory(TensorSymmetry) diff --git a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt deleted file mode 100644 index 28571dcb9..000000000 --- a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt 
b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt deleted file mode 100644 index 1e3b14712..000000000 --- a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_Core_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_Core_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Core/util COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h deleted file mode 100644 index c582e21f5..000000000 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ /dev/null @@ -1,526 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11META_H -#define EIGEN_CXX11META_H - -namespace Eigen { - -namespace internal { - -/** \internal - * \file CXX11/Core/util/CXX11Meta.h - * This file contains generic metaprogramming classes which are not specifically related to Eigen. - * This file expands upon Core/util/Meta.h and adds support for C++11 specific features. - */ - -template -struct type_list { constexpr static int count = sizeof...(tt); }; - -template -struct type_list { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; }; - -template -struct numeric_list { constexpr static std::size_t count = sizeof...(nn); }; - -template -struct numeric_list { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; }; - -/* numeric list constructors - * - * equivalencies: - * constructor result - * typename gen_numeric_list::type numeric_list - * typename gen_numeric_list_reversed::type numeric_list - * typename gen_numeric_list_swapped_pair::type numeric_list - * typename gen_numeric_list_repeated::type numeric_list - */ - -template struct gen_numeric_list : gen_numeric_list {}; -template struct gen_numeric_list { typedef numeric_list type; }; - -template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; -template struct gen_numeric_list_reversed { typedef numeric_list type; }; - -template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; -template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; - -template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; -template struct gen_numeric_list_repeated { typedef numeric_list type; }; - -/* list manipulation: concatenate */ - -template struct concat; - -template struct concat, type_list> { typedef type_list type; }; -template struct concat, numeric_list > { typedef numeric_list type; }; - -template struct mconcat; -template struct mconcat { typedef a type; }; -template struct mconcat : concat {}; -template struct mconcat : concat::type> {}; - -/* list manipulation: extract slices */ - -template struct take; -template struct take> : concat, typename take>::type> {}; -template struct take> { typedef type_list<> type; }; -template struct take<0, type_list> { typedef type_list<> type; }; -template<> struct take<0, type_list<>> { typedef type_list<> type; }; - -template struct take> : concat, typename take>::type> {}; -template struct take> { typedef numeric_list type; }; -template struct take<0, numeric_list> { typedef numeric_list type; }; -template struct take<0, numeric_list> { 
typedef numeric_list type; }; - -template struct h_skip_helper_numeric; -template struct h_skip_helper_numeric : h_skip_helper_numeric {}; -template struct h_skip_helper_numeric { typedef numeric_list type; }; -template struct h_skip_helper_numeric { typedef numeric_list type; }; -template struct h_skip_helper_numeric { typedef numeric_list type; }; - -template struct h_skip_helper_type; -template struct h_skip_helper_type : h_skip_helper_type {}; -template struct h_skip_helper_type<0, t, tt...> { typedef type_list type; }; -template struct h_skip_helper_type { typedef type_list<> type; }; -template<> struct h_skip_helper_type<0> { typedef type_list<> type; }; - -template -struct h_skip { - template - constexpr static inline typename h_skip_helper_numeric::type helper(numeric_list) { return typename h_skip_helper_numeric::type(); } - template - constexpr static inline typename h_skip_helper_type::type helper(type_list) { return typename h_skip_helper_type::type(); } -}; - -template struct skip { typedef decltype(h_skip::helper(a())) type; }; - -template struct slice : take::type> {}; - -/* list manipulation: retrieve single element from list */ - -template struct get; - -template struct get> : get> {}; -template struct get<0, type_list> { typedef a type; }; - -template struct get> : get> {}; -template struct get<0, numeric_list> { constexpr static T value = a; }; - -/* always get type, regardless of dummy; good for parameter pack expansion */ - -template struct id_numeric { typedef t type; }; -template struct id_type { typedef t type; }; - -/* equality checking, flagged version */ - -template struct is_same_gf : is_same { constexpr static int global_flags = 0; }; - -/* apply_op to list */ - -template< - bool from_left, // false - template class op, - typename additional_param, - typename... values -> -struct h_apply_op_helper { typedef type_list::type...> type; }; -template< - template class op, - typename additional_param, - typename... values -> -struct h_apply_op_helper { typedef type_list::type...> type; }; - -template< - bool from_left, - template class op, - typename additional_param -> -struct h_apply_op -{ - template - constexpr static typename h_apply_op_helper::type helper(type_list) - { return typename h_apply_op_helper::type(); } -}; - -template< - template class op, - typename additional_param, - typename a -> -struct apply_op_from_left { typedef decltype(h_apply_op::helper(a())) type; }; - -template< - template class op, - typename additional_param, - typename a -> -struct apply_op_from_right { typedef decltype(h_apply_op::helper(a())) type; }; - -/* see if an element is in a list */ - -template< - template class test, - typename check_against, - typename h_list, - bool last_check_positive = false -> -struct contained_in_list; - -template< - template class test, - typename check_against, - typename h_list -> -struct contained_in_list -{ - constexpr static bool value = true; -}; - -template< - template class test, - typename check_against, - typename a, - typename... 
as -> -struct contained_in_list, false> : contained_in_list, test::value> {}; - -template< - template class test, - typename check_against - EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty) -> -struct contained_in_list, false> { constexpr static bool value = false; }; - -/* see if an element is in a list and check for global flags */ - -template< - template class test, - typename check_against, - typename h_list, - int default_flags = 0, - bool last_check_positive = false, - int last_check_flags = default_flags -> -struct contained_in_list_gf; - -template< - template class test, - typename check_against, - typename h_list, - int default_flags, - int last_check_flags -> -struct contained_in_list_gf -{ - constexpr static bool value = true; - constexpr static int global_flags = last_check_flags; -}; - -template< - template class test, - typename check_against, - typename a, - typename... as, - int default_flags, - int last_check_flags -> -struct contained_in_list_gf, default_flags, false, last_check_flags> : contained_in_list_gf, default_flags, test::value, test::global_flags> {}; - -template< - template class test, - typename check_against - EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), - int default_flags, - int last_check_flags -> -struct contained_in_list_gf, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; }; - -/* generic reductions */ - -template< - typename Reducer, - typename... Ts -> struct reduce; - -template< - typename Reducer -> struct reduce -{ - constexpr static inline int run() { return Reducer::Identity; } -}; - -template< - typename Reducer, - typename A -> struct reduce -{ - constexpr static inline A run(A a) { return a; } -}; - -template< - typename Reducer, - typename A, - typename... Ts -> struct reduce -{ - constexpr static inline auto run(A a, Ts... 
ts) -> decltype(Reducer::run(a, reduce::run(ts...))) { - return Reducer::run(a, reduce::run(ts...)); - } -}; - -/* generic binary operations */ - -struct sum_op { - template EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; } - static constexpr int Identity = 0; -}; -struct product_op { - template EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; } - static constexpr int Identity = 1; -}; - -struct logical_and_op { template constexpr static inline auto run(A a, B b) -> decltype(a && b) { return a && b; } }; -struct logical_or_op { template constexpr static inline auto run(A a, B b) -> decltype(a || b) { return a || b; } }; - -struct equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a == b) { return a == b; } }; -struct not_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } }; -struct lesser_op { template constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } }; -struct lesser_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; -struct greater_op { template constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } }; -struct greater_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; - -/* generic unary operations */ - -struct not_op { template constexpr static inline auto run(A a) -> decltype(!a) { return !a; } }; -struct negation_op { template constexpr static inline auto run(A a) -> decltype(-a) { return -a; } }; -struct greater_equal_zero_op { template constexpr static inline auto run(A a) -> decltype(a >= 0) { return a >= 0; } }; - - -/* reductions for lists */ - -// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it -// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 -// does... -template -constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts) -{ - return reduce::run(ts...); -} - -template -constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_sum(Ts... 
ts) -{ - return reduce::run(ts...); -} - -/* reverse arrays */ - -template -constexpr inline Array h_array_reverse(Array arr, numeric_list) -{ - return {{array_get(arr)...}}; -} - -template -constexpr inline array array_reverse(array arr) -{ - return h_array_reverse(arr, typename gen_numeric_list::type()); -} - - -/* generic array reductions */ - -// can't reuse standard reduce() interface above because Intel's Compiler -// *really* doesn't like it, so we just reimplement the stuff -// (start from N - 1 and work down to 0 because specialization for -// n == N - 1 also doesn't work in Intel's compiler, so it goes into -// an infinite loop) -template -struct h_array_reduce { - EIGEN_DEVICE_FUNC constexpr static inline auto run(array arr, T identity) -> decltype(Reducer::run(h_array_reduce::run(arr, identity), array_get(arr))) - { - return Reducer::run(h_array_reduce::run(arr, identity), array_get(arr)); - } -}; - -template -struct h_array_reduce -{ - EIGEN_DEVICE_FUNC constexpr static inline T run(const array& arr, T) - { - return array_get<0>(arr); - } -}; - -template -struct h_array_reduce -{ - EIGEN_DEVICE_FUNC constexpr static inline T run(const array&, T identity) - { - return identity; - } -}; - -template -EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array& arr, T identity) -> decltype(h_array_reduce::run(arr, identity)) -{ - return h_array_reduce::run(arr, identity); -} - -/* standard array reductions */ - -template -EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array& arr) -> decltype(array_reduce(arr, static_cast(0))) -{ - return array_reduce(arr, static_cast(0)); -} - -template -EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array& arr) -> decltype(array_reduce(arr, static_cast(1))) -{ - return array_reduce(arr, static_cast(1)); -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { - eigen_assert(a.size() > 0); - t prod = 1; - for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } - return prod; -} - -/* zip an array */ - -template -constexpr inline array h_array_zip(array a, array b, numeric_list) -{ - return array{{ Op::run(array_get(a), array_get(b))... }}; -} - -template -constexpr inline array array_zip(array a, array b) -{ - return h_array_zip(a, b, typename gen_numeric_list::type()); -} - -/* zip an array and reduce the result */ - -template -constexpr inline auto h_array_zip_and_reduce(array a, array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) -{ - return reduce::type...>::run(Op::run(array_get(a), array_get(b))...); -} - -template -constexpr inline auto array_zip_and_reduce(array a, array b) -> decltype(h_array_zip_and_reduce(a, b, typename gen_numeric_list::type())) -{ - return h_array_zip_and_reduce(a, b, typename gen_numeric_list::type()); -} - -/* apply stuff to an array */ - -template -constexpr inline array h_array_apply(array a, numeric_list) -{ - return array{{ Op::run(array_get(a))... 
}}; -} - -template -constexpr inline array array_apply(array a) -{ - return h_array_apply(a, typename gen_numeric_list::type()); -} - -/* apply stuff to an array and reduce */ - -template -constexpr inline auto h_array_apply_and_reduce(array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) -{ - return reduce::type...>::run(Op::run(array_get(arr))...); -} - -template -constexpr inline auto array_apply_and_reduce(array a) -> decltype(h_array_apply_and_reduce(a, typename gen_numeric_list::type())) -{ - return h_array_apply_and_reduce(a, typename gen_numeric_list::type()); -} - -/* repeat a value n times (and make an array out of it - * usage: - * array = repeat<16>(42); - */ - -template -struct h_repeat -{ - template - constexpr static inline array run(t v, numeric_list) - { - return {{ typename id_numeric::type(v)... }}; - } -}; - -template -constexpr array repeat(t v) { return h_repeat::run(v, typename gen_numeric_list::type()); } - -/* instantiate a class by a C-style array */ -template -struct h_instantiate_by_c_array; - -template -struct h_instantiate_by_c_array -{ - static InstType run(ArrType* arr, Ps... args) - { - return h_instantiate_by_c_array::run(arr + 1, args..., arr[0]); - } -}; - -template -struct h_instantiate_by_c_array -{ - static InstType run(ArrType* arr, Ps... args) - { - return h_instantiate_by_c_array::run(arr + 1, arr[0], args...); - } -}; - -template -struct h_instantiate_by_c_array -{ - static InstType run(ArrType* arr, Ps... args) - { - (void)arr; - return InstType(args...); - } -}; - -template -struct h_instantiate_by_c_array -{ - static InstType run(ArrType* arr, Ps... args) - { - (void)arr; - return InstType(args...); - } -}; - -template -InstType instantiate_by_c_array(ArrType* arr) -{ - return h_instantiate_by_c_array::run(arr); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11META_H diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h deleted file mode 100644 index fe4d22803..000000000 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ /dev/null @@ -1,88 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11WORKAROUNDS_H -#define EIGEN_CXX11WORKAROUNDS_H - -/* COMPATIBILITY CHECKS - * (so users of compilers that are too old get some realistic error messages) - */ -#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310) -#error Intel Compiler only supports required C++ features since version 13.1. -// note that most stuff in principle works with 13.0 but when combining -// some features, at some point 13.0 will just fail with an internal assertion -#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) -// G++ < 4.6 by default will continue processing the source files - even if we use #error to make -// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error -// it sees. 
Unfortunately, that is still not our #error directive, but at least the output is -// short enough the user has a chance to see that the compiler version is not sufficient for -// the funky template mojo we use. -#pragma GCC diagnostic error "-Wfatal-errors" -#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6. -#endif - -/* Check that the compiler at least claims to support C++11. It might not be sufficient - * because the compiler may not implement it correctly, but at least we'll know. - * On the other hand, visual studio still doesn't claim to support C++11 although it's - * compliant enugh for our purpose. - */ -#if (__cplusplus <= 199711L) && (EIGEN_COMP_MSVC < 1900) -#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) -#pragma GCC diagnostic error "-Wfatal-errors" -#endif -#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) -#endif - -namespace Eigen { - -namespace internal { - -/* std::get is only constexpr in C++14, not yet in C++11 - */ - - -template constexpr inline T& array_get(std::vector& a) { return a[I]; } -template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } -template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } - -/* Suppose you have a template of the form - * template struct X; - * And you want to specialize it in such a way: - * template struct X> { ::: }; - * template<> struct X> { ::: }; - * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since - * g++ can only match templates called with parameter packs if the number of template - * arguments is not a fixed size (so inside the first specialization, referencing - * X> will fail in g++). On the other hand, g++ will accept the following: - * template struct X> { ::: }: - * as an additional (!) specialization, which will then only match the empty case. - * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax, - * so we have to create a workaround for this. - */ -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) -#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n -#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) -#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n... -#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n... -#else -#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) -#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) -#define EIGEN_TPL_PP_SPEC_HACK_USE(n) -#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11WORKAROUNDS_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h deleted file mode 100644 index 579519b04..000000000 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ /dev/null @@ -1,271 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_EMULATE_ARRAY_H -#define EIGEN_EMULATE_ARRAY_H - - - -// The array class is only available starting with cxx11. Emulate our own here -// if needed. 
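The emulated class that follows mirrors the small slice of std::array the tensor code relies on: operator[], front()/back(), a static size(), and element-wise constructors for up to eight values. A short usage sketch, assuming the emulation code path is active (header path and values are illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>   // pulls in Eigen::array

    int emulated_array_demo() {
      // On the std::array code path this element-wise constructor does not
      // exist and brace initialization would be used instead.
      Eigen::array<int, 3> dims(2, 3, 4);
      int first = dims.front();                           // 2
      std::size_t count = Eigen::array<int, 3>::size();   // 3
      return first + static_cast<int>(count);
    }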
Beware, msvc still doesn't advertise itself as a c++11 compiler! -// Moreover, CUDA doesn't support the STL containers, so we use our own instead. -#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) - -namespace Eigen { -template class array { - public: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& front() { return values[0]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& front() const { return values[0]; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& back() { return values[n-1]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - static std::size_t size() { return n; } - - T values[n]; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array() { } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v) { - EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { - EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { - EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, - const T& v4) { - EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5) { - EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6) { - EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6, const T& v7) { - EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - values[6] = v7; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array( - const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6, const T& v7, const T& v8) { - EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - values[6] = v7; - values[7] = v8; - } - -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(std::initializer_list l) { - eigen_assert(l.size() == n); - internal::smart_copy(l.begin(), l.end(), values); - } -#endif -}; - - -// Specialize array for zero size -template class array { - public: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& operator[] (size_t) { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& operator[] (size_t) const 
{ - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& front() { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& front() const { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& back() { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& back() const { - eigen_assert(false && "Can't index a zero size array"); - return dummy; - } - - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array() : dummy() { } - -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy() { - eigen_assert(l.size() == 0); - } -#endif - - private: - T dummy; -}; - -// Comparison operator -// Todo: implement !=, <, <=, >, and >= -template -EIGEN_DEVICE_FUNC bool operator==(const array& lhs, const array& rhs) { - for (std::size_t i = 0; i < N; ++i) { - if (lhs[i] != rhs[i]) { - return false; - } - } - return true; -} - - -namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { - return a[I]; -} -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { - return a[I]; -} - -template struct array_size; -template struct array_size > { - static const size_t value = N; -}; -template struct array_size; -template struct array_size& > { - static const size_t value = N; -}; -template struct array_size; -template struct array_size > { - static const size_t value = N; -}; -template struct array_size; -template struct array_size& > { - static const size_t value = N; -}; - -} // end namespace internal -} // end namespace Eigen - -#else - -// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen array -#include -namespace Eigen { - -template using array = std::array; - -namespace internal { -/* std::get is only constexpr in C++14, not yet in C++11 - * - libstdc++ from version 4.7 onwards has it nevertheless, - * so use that - * - libstdc++ older versions: use _M_instance directly - * - libc++ all versions so far: use __elems_ directly - * - all other libs: use std::get to be portable, but - * this may not be constexpr - */ -#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 -#define STD_GET_ARR_HACK a._M_instance[I] -#elif defined(_LIBCPP_VERSION) -#define STD_GET_ARR_HACK a.__elems_[I] -#else -#define STD_GET_ARR_HACK std::template get(a) -#endif - -template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } -template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } -template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } - -#undef STD_GET_ARR_HACK - -template struct array_size; -template struct array_size > { - static const size_t value = N; -}; -template struct array_size; -template struct array_size > { - static const size_t value = N; -}; -} // end namespace internal -} // end namespace Eigen - -#endif - - - - - -#endif // EIGEN_EMULATE_ARRAY_H diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h deleted file mode 100644 index d685d4f9d..000000000 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ /dev/null @@ -1,311 +0,0 @@ 
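The header removed below re-creates, for compilers without enough C++11 support (nvcc is the example the header itself names), the machinery of CXX11Meta.h using pre-C++11 techniques: type lists become nested head/tail pairs and compile-time integers travel as type2val wrappers. A simplified illustration of the head/tail encoding (not Eigen's exact definitions):

    #include <cstddef>

    // Simplified sketch; Eigen's own empty_list/type_list carry a few more members.
    struct empty_list { static const std::size_t count = 0; };

    template <typename Head, typename Tail = empty_list>
    struct type_list {
      typedef Head HeadType;
      typedef Tail TailType;
      static const std::size_t count = 1 + Tail::count;
    };

    // type_list<int, type_list<float> >::count == 2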
-// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_EMULATE_CXX11_META_H -#define EIGEN_EMULATE_CXX11_META_H - - - -namespace Eigen { - -namespace internal { - -/** \internal - * \file CXX11/Core/util/EmulateCXX11Meta.h - * This file emulates a subset of the functionality provided by CXXMeta.h for - * compilers that don't yet support cxx11 such as nvcc. - */ - -struct empty_list { static const std::size_t count = 0; }; - -template struct type_list { - typedef T HeadType; - typedef Tail TailType; - static const T head; - static const Tail tail; - static const std::size_t count = 1 + Tail::count; -}; - -struct null_type { }; - -template -struct make_type_list { - typedef typename make_type_list::type tailresult; - - typedef type_list type; -}; - -template<> struct make_type_list<> { - typedef empty_list type; -}; - - -template struct get_type; - -template -struct get_type<0, type_list > -{ - typedef Head type; -}; - -template -struct get_type > -{ - typedef typename get_type::type type; -}; - - -/* numeric list */ -template -struct type2val { - typedef T type; - static const T value = n; -}; - - -template struct gen_numeric_list_repeated; - -template struct gen_numeric_list_repeated { - typedef typename make_type_list >::type type; -}; - -template struct gen_numeric_list_repeated { - typedef typename make_type_list, type2val >::type type; -}; - -template struct gen_numeric_list_repeated { - typedef typename make_type_list, type2val, type2val >::type type; -}; - -template struct gen_numeric_list_repeated { - typedef typename make_type_list, type2val, type2val, type2val >::type type; -}; - -template struct gen_numeric_list_repeated { - typedef typename make_type_list, type2val, type2val, type2val, type2val >::type type; -}; - -template struct gen_numeric_list_repeated { - typedef typename make_type_list, type2val, type2val, - type2val, type2val, type2val >::type type; -}; - -template struct gen_numeric_list_repeated { - typedef typename make_type_list, type2val, type2val, - type2val, type2val, type2val, - type2val >::type type; -}; - -template struct gen_numeric_list_repeated { - typedef typename make_type_list, type2val, type2val, - type2val, type2val, type2val, - type2val, type2val >::type type; -}; - - -template struct get; - -template -struct get -{ - get() { eigen_assert(false && "index overflow"); } - typedef void type; - static const char value = '\0'; -}; - -template -struct get > -{ - get() { eigen_assert(false && "index overflow"); } - typedef void type; - static const char value = '\0'; -}; - -template -struct get<0, type_list > -{ - typedef typename Head::type type; - static const type value = Head::value; -}; - -template -struct get<0, type_list > -{ - typedef typename Head::type type; - static const type value = Head::value; -}; - -template -struct get > -{ - typedef typename Tail::HeadType::type type; - static const type value = get::value; -}; - - -template struct arg_prod { - static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod::value; -}; -template <> struct arg_prod { - static const int value = 1; -}; - - -template -array repeat(t v) { - array array; - array.fill(v); - return array; -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename 
Head::type array_get(type_list&) { - return get >::value; -} -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list&) { - return get >::value; -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) { - return arg_prod::value; -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { - t prod = 1; - for (size_t i = 0; i < n; ++i) { prod *= a[i]; } - return prod; -} -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { - return 0; -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { - eigen_assert(a.size() > 0); - t prod = 1; - for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } - return prod; -} - - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector& a) { - return a[I]; -} -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector& a) { - return a[I]; -} - -struct sum_op { - template static inline bool run(A a, B b) { return a + b; } -}; -struct product_op { - template static inline bool run(A a, B b) { return a * b; } -}; - -struct logical_and_op { - template static inline bool run(A a, B b) { return a && b; } -}; -struct logical_or_op { - template static inline bool run(A a, B b) { return a || b; } -}; - -struct equal_op { - template static inline bool run(A a, B b) { return a == b; } -}; -struct not_equal_op { - template static inline bool run(A a, B b) { return a != b; } -}; -struct lesser_op { - template static inline bool run(A a, B b) { return a < b; } -}; -struct lesser_equal_op { - template static inline bool run(A a, B b) { return a <= b; } -}; - -struct greater_op { - template static inline bool run(A a, B b) { return a > b; } -}; -struct greater_equal_op { - template static inline bool run(A a, B b) { return a >= b; } -}; - -struct not_op { - template static inline bool run(A a) { return !a; } -}; -struct negation_op { - template static inline bool run(A a) { return -a; } -}; -struct greater_equal_zero_op { - template static inline bool run(A a) { return a >= 0; } -}; - - -template -struct ArrayApplyAndReduce { - static inline bool run(const array& a) { - EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); - bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); - for (size_t i = 2; i < N; ++i) { - result = Reducer::run(result, Op::run(a[i])); - } - return result; - } -}; - -template -struct ArrayApplyAndReduce { - static inline bool run(const array& a) { - return Op::run(a[0]); - } -}; - -template -inline bool array_apply_and_reduce(const array& a) { - return ArrayApplyAndReduce::run(a); -} - -template -struct ArrayZipAndReduce { - static inline bool run(const array& a, const array& b) { - EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); - bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); - for (size_t i = 2; i < N; ++i) { - result = Reducer::run(result, Op::run(a[i], b[i])); - } - return result; - } -}; - -template -struct ArrayZipAndReduce { - static inline bool run(const array& a, const array& b) { - return Op::run(a[0], b[0]); - } -}; - -template -inline bool array_zip_and_reduce(const array& a, const array& b) { - return ArrayZipAndReduce::run(a, b); -} - -} // end namespace internal - -} // end namespace Eigen - - - -#endif // EIGEN_EMULATE_CXX11_META_H diff --git a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h deleted file 
mode 100644 index 551124bae..000000000 --- a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h +++ /dev/null @@ -1,130 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_FIXEDSIZEVECTOR_H -#define EIGEN_FIXEDSIZEVECTOR_H - -namespace Eigen { - -/** \class MaxSizeVector - * \ingroup Core - * - * \brief The MaxSizeVector class. - * - * The %MaxSizeVector provides a subset of std::vector functionality. - * - * The goal is to provide basic std::vector operations when using - * std::vector is not an option (e.g. on GPU or when compiling using - * FMA/AVX, as this can cause either compilation failures or illegal - * instruction failures). - * - * Beware: The constructors are not API compatible with these of - * std::vector. - */ -template -class MaxSizeVector { - public: - // Construct a new MaxSizeVector, reserve n elements. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit MaxSizeVector(size_t n) - : reserve_(n), size_(0), - data_(static_cast(internal::aligned_malloc(n * sizeof(T)))) { - for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; } - } - - // Construct a new MaxSizeVector, reserve and resize to n. - // Copy the init value to all elements. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit MaxSizeVector(size_t n, const T& init) - : reserve_(n), size_(n), - data_(static_cast(internal::aligned_malloc(n * sizeof(T)))) { - for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ~MaxSizeVector() { - for (size_t i = 0; i < size_; ++i) { - data_[i].~T(); - } - internal::aligned_free(data_); - } - - // Append new elements (up to reserved size). - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void push_back(const T& t) { - eigen_assert(size_ < reserve_); - data_[size_++] = t; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T& operator[] (size_t i) const { - eigen_assert(i < size_); - return data_[i]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T& operator[] (size_t i) { - eigen_assert(i < size_); - return data_[i]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T& back() { - eigen_assert(size_ > 0); - return data_[size_ - 1]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T& back() const { - eigen_assert(size_ > 0); - return data_[size_ - 1]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void pop_back() { - // NOTE: This does not destroy the value at the end the way - // std::vector's version of pop_back() does. That happens when - // the Vector is destroyed. 
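As the class comment above notes, MaxSizeVector is deliberately not constructor-compatible with std::vector: its capacity is fixed at construction and push_back() only fills slots that were reserved up front. A short usage sketch (the include shown is one of the modules that pulls the header in after this refactor; values are illustrative):

    #include <unsupported/Eigen/CXX11/ThreadPool>   // includes src/util/MaxSizeVector.h

    int max_size_vector_demo() {
      Eigen::MaxSizeVector<int> v(4);   // capacity 4, size() == 0
      v.push_back(7);
      v.push_back(9);
      int last = v.back();              // 9
      v.pop_back();                     // size() == 1; the popped element is only
                                        // destroyed when the vector itself is
      return last + static_cast<int>(v.size());
    }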
- eigen_assert(size_ > 0); - size_--; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t size() const { return size_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - bool empty() const { return size_ == 0; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T* data() { return data_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T* data() const { return data_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T* begin() { return data_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T* end() { return data_ + size_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T* begin() const { return data_; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const T* end() const { return data_ + size_; } - - private: - size_t reserve_; - size_t size_; - T* data_; -}; - -} // namespace Eigen - -#endif // EIGEN_FIXEDSIZEVECTOR_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt new file mode 100644 index 000000000..88fef50c6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_ThreadPool_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel + ) diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt new file mode 100644 index 000000000..7eab492d6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_util_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_util_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel + ) diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h new file mode 100644 index 000000000..f479590b9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -0,0 +1,542 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11META_H +#define EIGEN_CXX11META_H + +#include +#include "EmulateArray.h" + +// Emulate the cxx11 functionality that we need if the compiler doesn't support it. +// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it +// supports enough of the standard for our needs +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 + +#include "CXX11Workarounds.h" + +namespace Eigen { + +namespace internal { + +/** \internal + * \file CXX11/util/CXX11Meta.h + * This file contains generic metaprogramming classes which are not specifically related to Eigen. + * This file expands upon Core/util/Meta.h and adds support for C++11 specific features. 
+ */ + +template +struct type_list { constexpr static int count = sizeof...(tt); }; + +template +struct type_list { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; }; + +template +struct numeric_list { constexpr static std::size_t count = sizeof...(nn); }; + +template +struct numeric_list { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; }; + +/* numeric list constructors + * + * equivalencies: + * constructor result + * typename gen_numeric_list::type numeric_list + * typename gen_numeric_list_reversed::type numeric_list + * typename gen_numeric_list_swapped_pair::type numeric_list + * typename gen_numeric_list_repeated::type numeric_list + */ + +template struct gen_numeric_list : gen_numeric_list {}; +template struct gen_numeric_list { typedef numeric_list type; }; + +template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; +template struct gen_numeric_list_reversed { typedef numeric_list type; }; + +template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; +template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; + +template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; +template struct gen_numeric_list_repeated { typedef numeric_list type; }; + +/* list manipulation: concatenate */ + +template struct concat; + +template struct concat, type_list> { typedef type_list type; }; +template struct concat, numeric_list > { typedef numeric_list type; }; + +template struct mconcat; +template struct mconcat { typedef a type; }; +template struct mconcat : concat {}; +template struct mconcat : concat::type> {}; + +/* list manipulation: extract slices */ + +template struct take; +template struct take> : concat, typename take>::type> {}; +template struct take> { typedef type_list<> type; }; +template struct take<0, type_list> { typedef type_list<> type; }; +template<> struct take<0, type_list<>> { typedef type_list<> type; }; + +template struct take> : concat, typename take>::type> {}; +template struct take> { typedef numeric_list type; }; +template struct take<0, numeric_list> { typedef numeric_list type; }; +template struct take<0, numeric_list> { typedef numeric_list type; }; + +template struct h_skip_helper_numeric; +template struct h_skip_helper_numeric : h_skip_helper_numeric {}; +template struct h_skip_helper_numeric { typedef numeric_list type; }; +template struct h_skip_helper_numeric { typedef numeric_list type; }; +template struct h_skip_helper_numeric { typedef numeric_list type; }; + +template struct h_skip_helper_type; +template struct h_skip_helper_type : h_skip_helper_type {}; +template struct h_skip_helper_type<0, t, tt...> { typedef type_list type; }; +template struct h_skip_helper_type { typedef type_list<> type; }; +template<> struct h_skip_helper_type<0> { typedef type_list<> type; }; + +template +struct h_skip { + template + constexpr static inline typename h_skip_helper_numeric::type helper(numeric_list) { return typename h_skip_helper_numeric::type(); } + template + constexpr static inline typename h_skip_helper_type::type helper(type_list) { return typename h_skip_helper_type::type(); } +}; + +template struct skip { typedef decltype(h_skip::helper(a())) type; }; + +template struct slice : take::type> {}; + +/* list manipulation: retrieve single element from list */ + +template struct get; + +template struct get> : get> {}; +template struct get<0, type_list> { typedef a type; }; + +template struct get> : get> {}; +template struct 
get<0, numeric_list> { constexpr static T value = a; }; + +/* always get type, regardless of dummy; good for parameter pack expansion */ + +template struct id_numeric { typedef t type; }; +template struct id_type { typedef t type; }; + +/* equality checking, flagged version */ + +template struct is_same_gf : is_same { constexpr static int global_flags = 0; }; + +/* apply_op to list */ + +template< + bool from_left, // false + template class op, + typename additional_param, + typename... values +> +struct h_apply_op_helper { typedef type_list::type...> type; }; +template< + template class op, + typename additional_param, + typename... values +> +struct h_apply_op_helper { typedef type_list::type...> type; }; + +template< + bool from_left, + template class op, + typename additional_param +> +struct h_apply_op +{ + template + constexpr static typename h_apply_op_helper::type helper(type_list) + { return typename h_apply_op_helper::type(); } +}; + +template< + template class op, + typename additional_param, + typename a +> +struct apply_op_from_left { typedef decltype(h_apply_op::helper(a())) type; }; + +template< + template class op, + typename additional_param, + typename a +> +struct apply_op_from_right { typedef decltype(h_apply_op::helper(a())) type; }; + +/* see if an element is in a list */ + +template< + template class test, + typename check_against, + typename h_list, + bool last_check_positive = false +> +struct contained_in_list; + +template< + template class test, + typename check_against, + typename h_list +> +struct contained_in_list +{ + constexpr static bool value = true; +}; + +template< + template class test, + typename check_against, + typename a, + typename... as +> +struct contained_in_list, false> : contained_in_list, test::value> {}; + +template< + template class test, + typename check_against + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty) +> +struct contained_in_list, false> { constexpr static bool value = false; }; + +/* see if an element is in a list and check for global flags */ + +template< + template class test, + typename check_against, + typename h_list, + int default_flags = 0, + bool last_check_positive = false, + int last_check_flags = default_flags +> +struct contained_in_list_gf; + +template< + template class test, + typename check_against, + typename h_list, + int default_flags, + int last_check_flags +> +struct contained_in_list_gf +{ + constexpr static bool value = true; + constexpr static int global_flags = last_check_flags; +}; + +template< + template class test, + typename check_against, + typename a, + typename... as, + int default_flags, + int last_check_flags +> +struct contained_in_list_gf, default_flags, false, last_check_flags> : contained_in_list_gf, default_flags, test::value, test::global_flags> {}; + +template< + template class test, + typename check_against + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), + int default_flags, + int last_check_flags +> +struct contained_in_list_gf, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; }; + +/* generic reductions */ + +template< + typename Reducer, + typename... Ts +> struct reduce; + +template< + typename Reducer +> struct reduce +{ + constexpr static inline int run() { return Reducer::Identity; } +}; + +template< + typename Reducer, + typename A +> struct reduce +{ + constexpr static inline A run(A a) { return a; } +}; + +template< + typename Reducer, + typename A, + typename... 
Ts +> struct reduce +{ + constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce::run(ts...))) { + return Reducer::run(a, reduce::run(ts...)); + } +}; + +/* generic binary operations */ + +struct sum_op { + template EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; } + static constexpr int Identity = 0; +}; +struct product_op { + template EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; } + static constexpr int Identity = 1; +}; + +struct logical_and_op { template constexpr static inline auto run(A a, B b) -> decltype(a && b) { return a && b; } }; +struct logical_or_op { template constexpr static inline auto run(A a, B b) -> decltype(a || b) { return a || b; } }; + +struct equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a == b) { return a == b; } }; +struct not_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } }; +struct lesser_op { template constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } }; +struct lesser_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; +struct greater_op { template constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } }; +struct greater_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; + +/* generic unary operations */ + +struct not_op { template constexpr static inline auto run(A a) -> decltype(!a) { return !a; } }; +struct negation_op { template constexpr static inline auto run(A a) -> decltype(-a) { return -a; } }; +struct greater_equal_zero_op { template constexpr static inline auto run(A a) -> decltype(a >= 0) { return a >= 0; } }; + + +/* reductions for lists */ + +// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it +// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 +// does... +template +constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts) +{ + return reduce::run(ts...); +} + +template +constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_sum(Ts... 
ts) +{ + return reduce::run(ts...); +} + +/* reverse arrays */ + +template +constexpr inline Array h_array_reverse(Array arr, numeric_list) +{ + return {{array_get(arr)...}}; +} + +template +constexpr inline array array_reverse(array arr) +{ + return h_array_reverse(arr, typename gen_numeric_list::type()); +} + + +/* generic array reductions */ + +// can't reuse standard reduce() interface above because Intel's Compiler +// *really* doesn't like it, so we just reimplement the stuff +// (start from N - 1 and work down to 0 because specialization for +// n == N - 1 also doesn't work in Intel's compiler, so it goes into +// an infinite loop) +template +struct h_array_reduce { + EIGEN_DEVICE_FUNC constexpr static inline auto run(array arr, T identity) -> decltype(Reducer::run(h_array_reduce::run(arr, identity), array_get(arr))) + { + return Reducer::run(h_array_reduce::run(arr, identity), array_get(arr)); + } +}; + +template +struct h_array_reduce +{ + EIGEN_DEVICE_FUNC constexpr static inline T run(const array& arr, T) + { + return array_get<0>(arr); + } +}; + +template +struct h_array_reduce +{ + EIGEN_DEVICE_FUNC constexpr static inline T run(const array&, T identity) + { + return identity; + } +}; + +template +EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array& arr, T identity) -> decltype(h_array_reduce::run(arr, identity)) +{ + return h_array_reduce::run(arr, identity); +} + +/* standard array reductions */ + +template +EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array& arr) -> decltype(array_reduce(arr, static_cast(0))) +{ + return array_reduce(arr, static_cast(0)); +} + +template +EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array& arr) -> decltype(array_reduce(arr, static_cast(1))) +{ + return array_reduce(arr, static_cast(1)); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + +/* zip an array */ + +template +constexpr inline array h_array_zip(array a, array b, numeric_list) +{ + return array{{ Op::run(array_get(a), array_get(b))... }}; +} + +template +constexpr inline array array_zip(array a, array b) +{ + return h_array_zip(a, b, typename gen_numeric_list::type()); +} + +/* zip an array and reduce the result */ + +template +constexpr inline auto h_array_zip_and_reduce(array a, array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) +{ + return reduce::type...>::run(Op::run(array_get(a), array_get(b))...); +} + +template +constexpr inline auto array_zip_and_reduce(array a, array b) -> decltype(h_array_zip_and_reduce(a, b, typename gen_numeric_list::type())) +{ + return h_array_zip_and_reduce(a, b, typename gen_numeric_list::type()); +} + +/* apply stuff to an array */ + +template +constexpr inline array h_array_apply(array a, numeric_list) +{ + return array{{ Op::run(array_get(a))... 
}}; +} + +template +constexpr inline array array_apply(array a) +{ + return h_array_apply(a, typename gen_numeric_list::type()); +} + +/* apply stuff to an array and reduce */ + +template +constexpr inline auto h_array_apply_and_reduce(array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) +{ + return reduce::type...>::run(Op::run(array_get(arr))...); +} + +template +constexpr inline auto array_apply_and_reduce(array a) -> decltype(h_array_apply_and_reduce(a, typename gen_numeric_list::type())) +{ + return h_array_apply_and_reduce(a, typename gen_numeric_list::type()); +} + +/* repeat a value n times (and make an array out of it + * usage: + * array = repeat<16>(42); + */ + +template +struct h_repeat +{ + template + constexpr static inline array run(t v, numeric_list) + { + return {{ typename id_numeric::type(v)... }}; + } +}; + +template +constexpr array repeat(t v) { return h_repeat::run(v, typename gen_numeric_list::type()); } + +/* instantiate a class by a C-style array */ +template +struct h_instantiate_by_c_array; + +template +struct h_instantiate_by_c_array +{ + static InstType run(ArrType* arr, Ps... args) + { + return h_instantiate_by_c_array::run(arr + 1, args..., arr[0]); + } +}; + +template +struct h_instantiate_by_c_array +{ + static InstType run(ArrType* arr, Ps... args) + { + return h_instantiate_by_c_array::run(arr + 1, arr[0], args...); + } +}; + +template +struct h_instantiate_by_c_array +{ + static InstType run(ArrType* arr, Ps... args) + { + (void)arr; + return InstType(args...); + } +}; + +template +struct h_instantiate_by_c_array +{ + static InstType run(ArrType* arr, Ps... args) + { + (void)arr; + return InstType(args...); + } +}; + +template +InstType instantiate_by_c_array(ArrType* arr) +{ + return h_instantiate_by_c_array::run(arr); +} + +} // end namespace internal + +} // end namespace Eigen + +#else // Non C++11, fallback to emulation mode + +#include "src/Core/util/EmulateCXX11Meta.h" + +#endif + +#endif // EIGEN_CXX11META_H diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h new file mode 100644 index 000000000..fe4d22803 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h @@ -0,0 +1,88 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11WORKAROUNDS_H +#define EIGEN_CXX11WORKAROUNDS_H + +/* COMPATIBILITY CHECKS + * (so users of compilers that are too old get some realistic error messages) + */ +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310) +#error Intel Compiler only supports required C++ features since version 13.1. +// note that most stuff in principle works with 13.0 but when combining +// some features, at some point 13.0 will just fail with an internal assertion +#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) +// G++ < 4.6 by default will continue processing the source files - even if we use #error to make +// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error +// it sees. 
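The argument and array reductions added above in CXX11Meta.h (arg_sum, arg_prod, array_sum, array_prod, array_reduce) are declared constexpr, so on a conforming compiler they can fold at compile time. They are internal helpers rather than public API; the sketch below only illustrates their semantics, and whether the static_asserts are accepted depends on the compiler's constexpr support, which is exactly what the surrounding checks guard:

    #include <unsupported/Eigen/CXX11/Tensor>   // brings in the internal CXX11Meta helpers

    static_assert(Eigen::internal::arg_prod(2, 3, 4) == 24, "product of the arguments");
    static_assert(Eigen::internal::arg_sum(1, 2, 3) == 6, "sum of the arguments");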
Unfortunately, that is still not our #error directive, but at least the output is +// short enough the user has a chance to see that the compiler version is not sufficient for +// the funky template mojo we use. +#pragma GCC diagnostic error "-Wfatal-errors" +#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6. +#endif + +/* Check that the compiler at least claims to support C++11. It might not be sufficient + * because the compiler may not implement it correctly, but at least we'll know. + * On the other hand, visual studio still doesn't claim to support C++11 although it's + * compliant enugh for our purpose. + */ +#if (__cplusplus <= 199711L) && (EIGEN_COMP_MSVC < 1900) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic error "-Wfatal-errors" +#endif +#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) +#endif + +namespace Eigen { + +namespace internal { + +/* std::get is only constexpr in C++14, not yet in C++11 + */ + + +template constexpr inline T& array_get(std::vector& a) { return a[I]; } +template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } +template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } + +/* Suppose you have a template of the form + * template struct X; + * And you want to specialize it in such a way: + * template struct X> { ::: }; + * template<> struct X> { ::: }; + * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since + * g++ can only match templates called with parameter packs if the number of template + * arguments is not a fixed size (so inside the first specialization, referencing + * X> will fail in g++). On the other hand, g++ will accept the following: + * template struct X> { ::: }: + * as an additional (!) specialization, which will then only match the empty case. + * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax, + * so we have to create a workaround for this. + */ +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n +#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n... +#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n... +#else +#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_USE(n) +#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11WORKAROUNDS_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h new file mode 100644 index 000000000..24159e54c --- /dev/null +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -0,0 +1,267 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EMULATE_ARRAY_H +#define EIGEN_EMULATE_ARRAY_H + + + +// The array class is only available starting with cxx11. Emulate our own here +// if needed. 
Beware, msvc still doesn't advertise itself as a c++11 compiler! +// Moreover, CUDA doesn't support the STL containers, so we use our own instead. +#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) + +namespace Eigen { +template class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { return values[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { return values[0]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { return values[n-1]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + static std::size_t size() { return n; } + + T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(std::initializer_list l) { + eigen_assert(l.size() == n); + internal::smart_copy(l.begin(), l.end(), values); + } +#endif +}; + + +// Specialize array for zero size +template class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t) { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t) const 
{ + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() : dummy() { } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy() { + eigen_assert(l.size() == 0); + } +#endif + + private: + T dummy; +}; + +// Comparison operator +// Todo: implement !=, <, <=, >, and >= +template +EIGEN_DEVICE_FUNC bool operator==(const array& lhs, const array& rhs) { + for (std::size_t i = 0; i < N; ++i) { + if (lhs[i] != rhs[i]) { + return false; + } + } + return true; +} + + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { + return a[I]; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { + return a[I]; +} + +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; + +} // end namespace internal +} // end namespace Eigen + +#else + +// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array +#include +namespace Eigen { + +template using array = std::array; + +namespace internal { +/* std::get is only constexpr in C++14, not yet in C++11 + * - libstdc++ from version 4.7 onwards has it nevertheless, + * so use that + * - libstdc++ older versions: use _M_instance directly + * - libc++ all versions so far: use __elems_ directly + * - all other libs: use std::get to be portable, but + * this may not be constexpr + */ +#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 +#define STD_GET_ARR_HACK a._M_instance[I] +#elif defined(_LIBCPP_VERSION) +#define STD_GET_ARR_HACK a.__elems_[I] +#else +#define STD_GET_ARR_HACK std::template get(a) +#endif + +template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } +template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } +template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } + +#undef STD_GET_ARR_HACK + +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +} // end namespace internal +} // end namespace Eigen + +#endif + +#endif // EIGEN_EMULATE_ARRAY_H diff --git a/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h new file mode 100644 index 000000000..f3aa1b144 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h @@ -0,0 +1,311 @@ +// This file is part of 
Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EMULATE_CXX11_META_H +#define EIGEN_EMULATE_CXX11_META_H + + + +namespace Eigen { + +namespace internal { + +/** \internal + * \file CXX11/util/EmulateCXX11Meta.h + * This file emulates a subset of the functionality provided by CXXMeta.h for + * compilers that don't yet support cxx11 such as nvcc. + */ + +struct empty_list { static const std::size_t count = 0; }; + +template struct type_list { + typedef T HeadType; + typedef Tail TailType; + static const T head; + static const Tail tail; + static const std::size_t count = 1 + Tail::count; +}; + +struct null_type { }; + +template +struct make_type_list { + typedef typename make_type_list::type tailresult; + + typedef type_list type; +}; + +template<> struct make_type_list<> { + typedef empty_list type; +}; + + +template struct get_type; + +template +struct get_type<0, type_list > +{ + typedef Head type; +}; + +template +struct get_type > +{ + typedef typename get_type::type type; +}; + + +/* numeric list */ +template +struct type2val { + typedef T type; + static const T value = n; +}; + + +template struct gen_numeric_list_repeated; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val, type2val >::type type; +}; + + +template struct get; + +template +struct get +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template +struct get > +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template +struct get<0, type_list > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template +struct get<0, type_list > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template +struct get > +{ + typedef typename Tail::HeadType::type type; + static const type value = get::value; +}; + + +template struct arg_prod { + static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod::value; +}; +template <> struct arg_prod { + static const int value = 1; +}; + + +template +array repeat(t v) { + array array; + array.fill(v); + return array; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type 
array_get(type_list&) { + return get >::value; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list&) { + return get >::value; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) { + return arg_prod::value; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { + t prod = 1; + for (size_t i = 0; i < n; ++i) { prod *= a[i]; } + return prod; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { + return 0; +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector& a) { + return a[I]; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector& a) { + return a[I]; +} + +struct sum_op { + template static inline bool run(A a, B b) { return a + b; } +}; +struct product_op { + template static inline bool run(A a, B b) { return a * b; } +}; + +struct logical_and_op { + template static inline bool run(A a, B b) { return a && b; } +}; +struct logical_or_op { + template static inline bool run(A a, B b) { return a || b; } +}; + +struct equal_op { + template static inline bool run(A a, B b) { return a == b; } +}; +struct not_equal_op { + template static inline bool run(A a, B b) { return a != b; } +}; +struct lesser_op { + template static inline bool run(A a, B b) { return a < b; } +}; +struct lesser_equal_op { + template static inline bool run(A a, B b) { return a <= b; } +}; + +struct greater_op { + template static inline bool run(A a, B b) { return a > b; } +}; +struct greater_equal_op { + template static inline bool run(A a, B b) { return a >= b; } +}; + +struct not_op { + template static inline bool run(A a) { return !a; } +}; +struct negation_op { + template static inline bool run(A a) { return -a; } +}; +struct greater_equal_zero_op { + template static inline bool run(A a) { return a >= 0; } +}; + + +template +struct ArrayApplyAndReduce { + static inline bool run(const array& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; + } +}; + +template +struct ArrayApplyAndReduce { + static inline bool run(const array& a) { + return Op::run(a[0]); + } +}; + +template +inline bool array_apply_and_reduce(const array& a) { + return ArrayApplyAndReduce::run(a); +} + +template +struct ArrayZipAndReduce { + static inline bool run(const array& a, const array& b) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i], b[i])); + } + return result; + } +}; + +template +struct ArrayZipAndReduce { + static inline bool run(const array& a, const array& b) { + return Op::run(a[0], b[0]); + } +}; + +template +inline bool array_zip_and_reduce(const array& a, const array& b) { + return ArrayZipAndReduce::run(a, b); +} + +} // end namespace internal + +} // end namespace Eigen + + + +#endif // EIGEN_EMULATE_CXX11_META_H diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h new file mode 100644 index 
000000000..551124bae --- /dev/null +++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h @@ -0,0 +1,130 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_FIXEDSIZEVECTOR_H +#define EIGEN_FIXEDSIZEVECTOR_H + +namespace Eigen { + +/** \class MaxSizeVector + * \ingroup Core + * + * \brief The MaxSizeVector class. + * + * The %MaxSizeVector provides a subset of std::vector functionality. + * + * The goal is to provide basic std::vector operations when using + * std::vector is not an option (e.g. on GPU or when compiling using + * FMA/AVX, as this can cause either compilation failures or illegal + * instruction failures). + * + * Beware: The constructors are not API compatible with these of + * std::vector. + */ +template +class MaxSizeVector { + public: + // Construct a new MaxSizeVector, reserve n elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit MaxSizeVector(size_t n) + : reserve_(n), size_(0), + data_(static_cast(internal::aligned_malloc(n * sizeof(T)))) { + for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; } + } + + // Construct a new MaxSizeVector, reserve and resize to n. + // Copy the init value to all elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit MaxSizeVector(size_t n, const T& init) + : reserve_(n), size_(n), + data_(static_cast(internal::aligned_malloc(n * sizeof(T)))) { + for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ~MaxSizeVector() { + for (size_t i = 0; i < size_; ++i) { + data_[i].~T(); + } + internal::aligned_free(data_); + } + + // Append new elements (up to reserved size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void push_back(const T& t) { + eigen_assert(size_ < reserve_); + data_[size_++] = t; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& operator[] (size_t i) const { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& operator[] (size_t i) { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& back() { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& back() const { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void pop_back() { + // NOTE: This does not destroy the value at the end the way + // std::vector's version of pop_back() does. That happens when + // the Vector is destroyed. 
+ eigen_assert(size_ > 0); + size_--; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t size() const { return size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool empty() const { return size_ == 0; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* data() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* data() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* begin() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* end() { return data_ + size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* begin() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* end() const { return data_ + size_; } + + private: + size_t reserve_; + size_t size_; + T* data_; +}; + +} // namespace Eigen + +#endif // EIGEN_FIXEDSIZEVECTOR_H diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index ecac3add1..8911c59d8 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -10,7 +10,7 @@ #include "main.h" #include -#include +#include using Eigen::internal::is_same; using Eigen::internal::type_list; -- cgit v1.2.3 From 1f48f47ab73ec1132326769db44b0afeea8b6713 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 27 Apr 2016 19:59:44 +0200 Subject: Implement stricter argument checking for SYRK and SY2K and real matrices. To implement the BLAS API they should return info=2 if op='C' is passed for a complex matrix. Without this change, the Eigen BLAS fails the strict zblat3 and cblat3 tests in LAPACK 3.5. --- blas/level3_impl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/blas/level3_impl.h b/blas/level3_impl.h index beb36c47d..6c802cd5f 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -159,7 +159,7 @@ int EIGEN_BLAS_FUNC(trsm)(const char *side, const char *uplo, const char *opa, c return 0; int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4); - + if(SIDE(*side)==LEFT) { internal::gemm_blocking_space blocking(*m,*n,*m,1,false); @@ -385,7 +385,7 @@ int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const int info = 0; if(UPLO(*uplo)==INVALID) info = 1; - else if(OP(*op)==INVALID) info = 2; + else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) ) info = 2; else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda().setZero(); else matrix(c, *n, *n, *ldc).triangularView() *= beta; - + if(beta!=Scalar(0)) { matrix(c, *n, *n, *ldc).diagonal().real() *= beta; -- cgit v1.2.3 From 6744d776ba4c799d4d63d03c5305cba112b6de14 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Apr 2016 12:10:25 -0700 Subject: Added support for fpclassify in Eigen::Numext --- Eigen/src/Core/MathFunctions.h | 9 +++++++++ Eigen/src/Core/arch/CUDA/Half.h | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 5771abf7d..2bdf7dde7 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -937,6 +937,15 @@ template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return inte template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); } template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } +template EIGEN_DEVICE_FUNC int (fpclassify)(const T& x){ + #ifdef __CUDA_ARCH__ + return (::fpclassify)(x); + #else + using std::fpclassify; + return fpclassify(x); + #endif +} + template EIGEN_DEVICE_FUNC inline 
EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 9ecc4fd88..319074e4a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -407,6 +407,28 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); } +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (fpclassify)(const Eigen::half& a) { + const int exponent = a.x & 0x7c00; + const int mantissa = a.x & 0x03ff; + if (exponent == 0) { + if (mantissa == 0) { + // Positive or negative zero. + return FP_ZERO; + } else { + return FP_SUBNORMAL; + } + } else if (exponent == 0x7c00) { + // Maximum possible exponent signifies either NaN or +/- inf. + if (mantissa == 0) { + return FP_INFINITE; + } else { + return FP_NAN; + } + } else { + return FP_NORMAL; + } +} + template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { Eigen::half result; result.x = a.x & 0x7FFF; -- cgit v1.2.3 From 463738ccbe1ac26d841244aad7886dc92e639fe4 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 27 Apr 2016 12:26:18 -0700 Subject: Use computeProductBlockingSizes to compute blocking for both ShardByCol and ShardByRow cases. --- unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 3d3f6904f..5cf7b4f71 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -35,9 +35,7 @@ class TensorContractionBlocking { computeProductBlockingSizes(kc_, mc_, nc_, num_threads); } else { - if (kc_ && mc_ && nc_) { - mc_ = (((m / num_threads) + 15) / 16) * 16; - } + computeProductBlockingSizes(kc_, nc_, mc_, num_threads); } } -- cgit v1.2.3 From 25141b69d4dfe844d57e58d216649f6607cab713 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Apr 2016 12:57:21 -0700 Subject: Improved support for min and max on 16 bit floats when running on recent cuda gpus --- Eigen/src/Core/arch/CUDA/Half.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 319074e4a..37e714411 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -465,6 +465,25 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::h return Eigen::half(::ceilf(float(a))); } +template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::half& a, const Eigen::half& b) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f2 < f1 ? b : a; +#endif +} +template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::half& a, const Eigen::half& b) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f1 < f2 ? 
b : a; +#endif +} + #ifdef EIGEN_HAS_C99_MATH template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { return Eigen::half(Eigen::numext::lgamma(static_cast(a))); -- cgit v1.2.3 From f629fe95c8973706a9deacd586d1b16d6ea0c5c0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Apr 2016 13:11:19 -0700 Subject: Made the index type a template parameter to evaluateProductBlockingSizes Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 33 ++++++++++++----------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bd559dc6a..5b0473598 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -89,7 +89,7 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff * * \sa setCpuCacheSizes */ -template +template void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) { typedef gebp_traits Traits; @@ -115,7 +115,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). - const Index k_cache = (std::min)((l1-ksub)/kdiv, 320); + const Index k_cache = (numext::mini)((l1-ksub)/kdiv, 320); if (k_cache < k) { k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); @@ -129,7 +129,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n n = n_cache - (n_cache % nr); eigen_internal_assert(n > 0); } else { - n = (std::min)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); + n = (numext::mini)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); } if (l3 > l2) { @@ -140,7 +140,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n m = m_cache - (m_cache % mr); eigen_internal_assert(m > 0); } else { - m = (std::min)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); + m = (numext::mini)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); } } } @@ -157,7 +157,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway // because we use the coefficient-based implementation for them. - if((std::max)(k,(std::max)(m,n))<48) + if((numext::maxi)(k,(numext::maxi)(m,n))<48) return; typedef typename Traits::ResScalar ResScalar; @@ -174,7 +174,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // We also include a register-level block of the result (mx x nr). // (In an ideal world only the lhs panel would stay in L1) // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: - const Index max_kc = std::max(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); + const Index max_kc = numext::maxi(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); const Index old_k = k; if(k>max_kc) { @@ -219,7 +219,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); } // WARNING Below, we assume that Traits::nr is a power of two. 
- Index nc = std::min(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + Index nc = numext::mini(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); if(n>nc) { // We are really blocking over the columns: @@ -248,9 +248,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // we have both L2 and L3, and problem is small enough to be kept in L2 // Let's choose m such that lhs's block fit in 1/3 of L2 actual_lm = l2; - max_mc = (std::min)(576,max_mc); + max_mc = (numext::mini)(576,max_mc); } - Index mc = (std::min)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + Index mc = (numext::mini)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; else if (mc==0) return; m = (m%mc)==0 ? mc @@ -259,13 +259,14 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n } } +template inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) { #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); - m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); - n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + k = numext::mini(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = numext::mini(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = numext::mini(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); return true; } #else @@ -292,11 +293,11 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) * * \sa setCpuCacheSizes */ -template +template void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { if (!useSpecificBlockingSizes(k, m, n)) { - evaluateProductBlockingSizesHeuristic(k, m, n, num_threads); + evaluateProductBlockingSizesHeuristic(k, m, n, num_threads); } typedef gebp_traits Traits; @@ -310,10 +311,10 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads if (n > nr) n -= n % nr; } -template +template inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { - computeProductBlockingSizes(k, m, n, num_threads); + computeProductBlockingSizes(k, m, n, num_threads); } #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -- cgit v1.2.3 From 318e65e0ae4f6204b8704c77d10a35a340b2e666 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 27 Apr 2016 23:05:40 +0200 Subject: Fix missing inclusion of Eigen/Core --- unsupported/Eigen/CXX11/Tensor | 2 ++ unsupported/Eigen/CXX11/ThreadPool | 2 ++ 2 files changed, 4 insertions(+) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 5abd5de1a..1e97ad3c0 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -11,6 +11,8 @@ //#ifndef EIGEN_CXX11_TENSOR_MODULE //#define EIGEN_CXX11_TENSOR_MODULE +#include "../../../Eigen/Core" + #include #include "src/util/CXX11Meta.h" diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index 15e5634ed..09d637e9a 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -10,6 +10,8 @@ #ifndef EIGEN_CXX11_THREADPOOL_MODULE #define EIGEN_CXX11_THREADPOOL_MODULE +#include "../../../Eigen/Core" + #include /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module -- cgit v1.2.3 From c61170e87dccdd093c1471a7e287dd269635a461 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Apr 2016 14:22:20 -0700 Subject: fpclassify isn't portable enough. 
In particular, the return values of the function are not available on all the platforms Eigen supportes: remove it from Eigen. --- Eigen/src/Core/MathFunctions.h | 9 --------- Eigen/src/Core/arch/CUDA/Half.h | 21 --------------------- 2 files changed, 30 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 2bdf7dde7..5771abf7d 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -937,15 +937,6 @@ template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return inte template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); } template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } -template EIGEN_DEVICE_FUNC int (fpclassify)(const T& x){ - #ifdef __CUDA_ARCH__ - return (::fpclassify)(x); - #else - using std::fpclassify; - return fpclassify(x); - #endif -} - template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 37e714411..6387f2870 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -407,27 +407,6 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); } -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (fpclassify)(const Eigen::half& a) { - const int exponent = a.x & 0x7c00; - const int mantissa = a.x & 0x03ff; - if (exponent == 0) { - if (mantissa == 0) { - // Positive or negative zero. - return FP_ZERO; - } else { - return FP_SUBNORMAL; - } - } else if (exponent == 0x7c00) { - // Maximum possible exponent signifies either NaN or +/- inf. - if (mantissa == 0) { - return FP_INFINITE; - } else { - return FP_NAN; - } - } else { - return FP_NORMAL; - } -} template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { Eigen::half result; -- cgit v1.2.3 From 1a325ef71ce47713cab3cebc3ded44609ebcec28 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 27 Apr 2016 14:33:51 -0700 Subject: Detect cxx_constexpr support when compiling with clang. --- Eigen/src/Core/util/Macros.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 97627d14c..88f86c442 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -371,10 +371,10 @@ // Does the compiler support const expressions? 
#ifdef __CUDACC__ // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500) +#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500) #define EIGEN_HAS_CONSTEXPR 1 #endif -#elif (defined(__cplusplus) && __cplusplus >= 201402L) || \ +#elif __has_feature(cxx_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ EIGEN_GNUC_AT_LEAST(4,8) #define EIGEN_HAS_CONSTEXPR 1 #endif @@ -572,12 +572,12 @@ namespace Eigen { //------------------------------------------------------------------------------------------ // Static and dynamic alignment control -// +// // The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES // as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. // The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, // a default value is automatically computed based on architecture, compiler, and OS. -// +// // This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} // to be used to declare statically aligned buffers. //------------------------------------------------------------------------------------------ @@ -637,7 +637,7 @@ namespace Eigen { #ifndef EIGEN_MAX_STATIC_ALIGN_BYTES // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES - + // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in @@ -664,13 +664,13 @@ namespace Eigen { #else #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #endif - + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES #else #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 #endif - + #endif // If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES -- cgit v1.2.3 From 09b9e951e36cfd8835a557682f2250f01bbf7051 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 27 Apr 2016 14:59:11 -0700 Subject: Depend on the more extensive support for constexpr in clang: http://clang.llvm.org/docs/LanguageExtensions.html#c-1y-relaxed-constexpr --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 88f86c442..de1265af3 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -374,7 +374,7 @@ #if __cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500) #define EIGEN_HAS_CONSTEXPR 1 #endif -#elif __has_feature(cxx_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ +#elif __has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ EIGEN_GNUC_AT_LEAST(4,8) #define EIGEN_HAS_CONSTEXPR 1 #endif -- cgit v1.2.3 From 3ec81fc00f447ce47df079c66450c680155ceeb4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 27 Apr 2016 19:32:12 -0700 Subject: Fixed compilation error with clang. 
--- test/product_large.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/product_large.cpp b/test/product_large.cpp index 98f84c53b..845cd40ca 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -71,7 +71,7 @@ void test_product_large() std::ptrdiff_t m1 = internal::random(10,100)*16; std::ptrdiff_t n1 = internal::random(10,100)*16; // only makes sure it compiles fine - internal::computeProductBlockingSizes(k1,m1,n1,1); + internal::computeProductBlockingSizes(k1,m1,n1,1); } { -- cgit v1.2.3 From 40d1e2f8c7a85b7c0522d11d6e3d0c6a18bc9721 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Thu, 28 Apr 2016 13:57:08 -0700 Subject: Eliminate mutual recursion in igamma{,c}_impl::Run. Presently, igammac_impl::Run calls igamma_impl::Run, which in turn calls igammac_impl::Run. This isn't actually mutual recursion; the calls are guarded such that we never get into a loop. Nonetheless, it's a stretch for clang to prove this. As a result, clang emits a recursive call in both igammac_impl::Run and igamma_impl::Run. That this is suboptimal code is bad enough, but it's particularly bad when compiling for CUDA/nvptx. nvptx allows recursion, but only begrudgingly: If you have recursive calls in a kernel, it's on you to manually specify the kernel's stack size. Otherwise, ptxas will dump a warning, make a guess, and who knows if it's right. This change explicitly eliminates the mutual recursion in igammac_impl::Run and igamma_impl::Run. --- Eigen/src/Core/SpecialFunctions.h | 83 +++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index a3857ae1f..10ff4371e 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -515,6 +515,41 @@ struct igammac_impl { Copyright 1985, 1987, 1992 by Stephen L. Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits::quiet_NaN(); + + if ((x < zero) || (a <= zero)) { + // domain error + return nan; + } + + if ((x < one) || (x < a)) { + /* The checks above ensure that we meet the preconditions for + * igamma_impl::Impl(), so call it, rather than igamma_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igamma_impl::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igamma_impl calls igammac_impl::Impl. */ + friend struct igamma_impl; + + /* Actually computes igamc(a, x). 
+ * + * Preconditions: + * a > 0 + * x >= 1 + * x >= a + */ + static Scalar Impl(Scalar a, Scalar x) { const Scalar zero = 0; const Scalar one = 1; const Scalar two = 2; @@ -522,21 +557,11 @@ struct igammac_impl { const Scalar maxlog = numext::log(NumTraits::highest()); const Scalar big = igamma_helper::big(); const Scalar biginv = 1 / big; - const Scalar nan = NumTraits::quiet_NaN(); const Scalar inf = NumTraits::infinity(); Scalar ans, ax, c, yc, r, t, y, z; Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; - if ((x < zero) || ( a <= zero)) { - // domain error - return nan; - } - - if ((x < one) || (x < a)) { - return (one - igamma_impl::run(a, x)); - } - if (x == inf) return zero; // std::isinf crashes on CUDA /* Compute x**a * exp(-x) / gamma(a) */ @@ -678,22 +703,48 @@ struct igamma_impl { */ const Scalar zero = 0; const Scalar one = 1; - const Scalar machep = igamma_helper::machep(); - const Scalar maxlog = numext::log(NumTraits::highest()); const Scalar nan = NumTraits::quiet_NaN(); - double ans, ax, c, r; - if (x == zero) return zero; - if ((x < zero) || ( a <= zero)) { // domain error + if ((x < zero) || (a <= zero)) { // domain error return nan; } if ((x > one) && (x > a)) { - return (one - igammac_impl::run(a, x)); + /* The checks above ensure that we meet the preconditions for + * igammac_impl::Impl(), so call it, rather than igammac_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igammac_impl::Impl(a, x)); } + return Impl(a, x); + } + + private: + /* igammac_impl calls igamma_impl::Impl. */ + friend struct igammac_impl; + + /* Actually computes igam(a, x). + * + * Preconditions: + * x > 0 + * a > 0 + * !(x > 1 && x > a) + */ + static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = igamma_helper::machep(); + const Scalar maxlog = numext::log(NumTraits::highest()); + const Scalar nan = NumTraits::quiet_NaN(); + + double ans, ax, c, r; + /* Compute x**a * exp(-x) / gamma(a) */ ax = a * numext::log(x) - x - lgamma_impl::run(a); if (ax < -maxlog) { -- cgit v1.2.3 From a5d454508393541f370a3f9a0fd9a65888cfb7c8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Apr 2016 14:14:48 -0700 Subject: Deleted unused variable --- Eigen/src/Core/SpecialFunctions.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 10ff4371e..1ec0fb325 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -741,7 +741,6 @@ struct igamma_impl { const Scalar one = 1; const Scalar machep = igamma_helper::machep(); const Scalar maxlog = numext::log(NumTraits::highest()); - const Scalar nan = NumTraits::quiet_NaN(); double ans, ax, c, r; -- cgit v1.2.3 From dacb23277e75bc1c3465f41b6f3ba42359e41038 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Apr 2016 18:54:54 -0700 Subject: Fixed the igamma and igammac implementations to make them callable from a gpu kernel. 
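To make the recursion-elimination commit above easier to follow (the diff below only marks the new Impl helpers with EIGEN_DEVICE_FUNC), here is a standalone sketch of the pattern with hypothetical struct names and placeholder arithmetic; it is not Eigen's actual special-functions code. Each public run() validates the arguments itself and then calls either its own private Impl() or the other struct's Impl(), so no path can re-enter run() and the compiler no longer sees an apparent mutual recursion.

#include <cmath>

// Hypothetical stand-ins for igammac_impl / igamma_impl; the math is a placeholder.
struct LowerGammaImpl;  // forward declaration

struct UpperGammaImpl {
  static double run(double a, double x);
 private:
  friend struct LowerGammaImpl;             // lets LowerGammaImpl::run call Impl()
  static double Impl(double a, double x) {  // preconditions already checked by run()
    return std::exp(-x) * a;                // placeholder for the real continued fraction
  }
};

struct LowerGammaImpl {
  static double run(double a, double x);
 private:
  friend struct UpperGammaImpl;             // lets UpperGammaImpl::run call Impl()
  static double Impl(double a, double x) {  // preconditions already checked by run()
    return 1.0 - std::exp(-x) * a;          // placeholder for the real power series
  }
};

inline double UpperGammaImpl::run(double a, double x) {
  if (x < 1.0 || x < a)
    return 1.0 - LowerGammaImpl::Impl(a, x);  // calls Impl(), never LowerGammaImpl::run()
  return Impl(a, x);
}

inline double LowerGammaImpl::run(double a, double x) {
  if (x > 1.0 && x > a)
    return 1.0 - UpperGammaImpl::Impl(a, x);  // calls Impl(), never UpperGammaImpl::run()
  return Impl(a, x);
}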
--- Eigen/src/Core/SpecialFunctions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 1ec0fb325..3513a5c63 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -549,7 +549,7 @@ struct igammac_impl { * x >= 1 * x >= a */ - static Scalar Impl(Scalar a, Scalar x) { + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { const Scalar zero = 0; const Scalar one = 1; const Scalar two = 2; @@ -736,7 +736,7 @@ struct igamma_impl { * a > 0 * !(x > 1 && x > a) */ - static Scalar Impl(Scalar a, Scalar x) { + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { const Scalar zero = 0; const Scalar one = 1; const Scalar machep = igamma_helper::machep(); -- cgit v1.2.3 From a524a26fdc558d867a9bb04a9c4cf4daf9813615 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Apr 2016 18:55:53 -0700 Subject: Fixed a few memory leaks --- unsupported/test/cxx11_tensor_cuda.cu | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index db2d8ee4c..4026f48f0 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -661,6 +661,9 @@ void test_cuda_digamma() for (int i = 5; i < 7; ++i) { VERIFY_IS_EQUAL(out(i), expected_out(i)); } + + cudaFree(d_in); + cudaFree(d_out); } template @@ -725,6 +728,10 @@ void test_cuda_zeta() VERIFY_IS_APPROX(out(i), expected_out(i)); } } + + cudaFree(d_in_x); + cudaFree(d_in_q); + cudaFree(d_out); } template @@ -787,6 +794,10 @@ void test_cuda_polygamma() for (int i = 0; i < 7; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } + + cudaFree(d_in_x); + cudaFree(d_in_n); + cudaFree(d_out); } template @@ -826,9 +837,9 @@ void test_cuda_igamma() Scalar* d_a; Scalar* d_x; Scalar* d_out; - cudaMalloc((void**)(&d_a), bytes); - cudaMalloc((void**)(&d_x), bytes); - cudaMalloc((void**)(&d_out), bytes); + assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); @@ -854,6 +865,10 @@ void test_cuda_igamma() } } } + + cudaFree(d_a); + cudaFree(d_x); + cudaFree(d_out); } template @@ -920,6 +935,10 @@ void test_cuda_igammac() } } } + + cudaFree(d_a); + cudaFree(d_x); + cudaFree(d_out); } template @@ -935,8 +954,8 @@ void test_cuda_erf(const Scalar stddev) Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); -- cgit v1.2.3 From 0f3c4c8ff4a6635db77195a8919c743f34181cc2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 29 Apr 2016 18:26:08 +0200 Subject: Fix compilation of sparse.cast<>().transpose(). 
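One note on the memory-leak commit just above, before the sparse fix below: the change simply makes each CUDA test check its cudaMalloc calls and release the device buffers with cudaFree once the results have been copied back. A minimal host-side sketch of that pattern follows; it uses only the CUDA runtime API, elides the kernel launch, and all function and variable names are invented for the sketch.

#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

// Allocates device buffers, copies data over, and, as the commit above enforces,
// frees everything it allocated. The assert-on-cudaMalloc style mirrors the tests;
// note that assert() disappears under NDEBUG, so this is test-only style.
void roundtrip(const float* host_in, float* host_out, std::size_t n) {
  const std::size_t bytes = n * sizeof(float);
  float* d_in = 0;
  float* d_out = 0;
  assert(cudaMalloc(reinterpret_cast<void**>(&d_in), bytes) == cudaSuccess);
  assert(cudaMalloc(reinterpret_cast<void**>(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_in, host_in, bytes, cudaMemcpyHostToDevice);
  // ... kernel launch writing d_out elided; zero it so the copy back is well defined ...
  cudaMemset(d_out, 0, bytes);
  cudaMemcpy(host_out, d_out, bytes, cudaMemcpyDeviceToHost);

  // Without these two calls the device buffers leak on every test invocation.
  cudaFree(d_in);
  cudaFree(d_out);
}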
--- Eigen/src/SparseCore/SparseCwiseUnaryOp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index fe4a97120..9143a4c82 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -22,7 +22,7 @@ struct unary_evaluator, IteratorBased> typedef CwiseUnaryOp XprType; class InnerIterator; -// class ReverseInnerIterator; + class ReverseInnerIterator; enum { CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, -- cgit v1.2.3
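For reference, a minimal program exercising the expression that the commit above fixes. This is an illustrative reproduction written for this note, not a test case taken from the patch.

#include <Eigen/SparseCore>
#include <iostream>
#include <vector>

int main() {
  Eigen::SparseMatrix<double> A(3, 3);
  std::vector<Eigen::Triplet<double> > entries;
  entries.push_back(Eigen::Triplet<double>(0, 1, 2.0));
  entries.push_back(Eigen::Triplet<double>(2, 0, -1.0));
  A.setFromTriplets(entries.begin(), entries.end());

  // Casting a sparse expression and then transposing it is the combination that
  // previously failed to compile; after the fix it evaluates as expected.
  Eigen::SparseMatrix<float> B = A.cast<float>().transpose();
  std::cout << B.nonZeros() << " nonzeros\n";
  return 0;
}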