diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-04-29 13:41:26 -0700 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-04-29 13:41:26 -0700 |
commit | 07a247dcf4e86f9f741b68e1d8e0897de3eeca57 (patch) | |
tree | d103bd20faa1f103035bac2f21507ecc65f97f68 | |
parent | fa5a8f055aebbf4f39fca26e857351103fab4d11 (diff) | |
parent | 0f3c4c8ff4a6635db77195a8919c743f34181cc2 (diff) |
Pulled latest updates from upstream
113 files changed, 3576 insertions, 953 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 003d00c06..5546184a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,10 +221,17 @@ if(NOT MSVC) message(STATUS "Enabling FMA in tests/examples") endif() +<<<<<<< local option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512dq") message(STATUS "Enabling AVX512 in tests/examples") +======= + option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF) + if(EIGEN_TEST_F16C) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") + message(STATUS "Enabling F16C in tests/examples") +>>>>>>> other endif() option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF) diff --git a/Eigen/Core b/Eigen/Core index c7249df21..d67cb67af 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -33,13 +33,13 @@ #ifdef EIGEN_EXCEPTIONS #undef EIGEN_EXCEPTIONS #endif - + // All functions callable from CUDA code must be qualified with __device__ #define EIGEN_DEVICE_FUNC __host__ __device__ - + #else #define EIGEN_DEVICE_FUNC - + #endif // When compiling CUDA device code with NVCC, pull in math functions from the @@ -212,7 +212,7 @@ #endif #endif -#if defined(__F16C__) +#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C #endif @@ -222,10 +222,14 @@ #include <vector_types.h> #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 #define EIGEN_HAS_CUDA_FP16 - #include <cuda_fp16.h> #endif #endif +#if defined EIGEN_HAS_CUDA_FP16 + #include <host_defines.h> + #include <cuda_fp16.h> +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -306,7 +310,7 @@ inline static const char *SimdInstructionSetsInUse(void) { // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to // ensure QNX/QCC support using std::size_t; -// gcc 4.6.0 wants std:: for ptrdiff_t +// gcc 4.6.0 wants std:: for ptrdiff_t using std::ptrdiff_t; /** \defgroup Core_Module Core module @@ -455,6 +459,7 @@ using std::ptrdiff_t; #include "src/Core/products/TriangularSolverVector.h" #include "src/Core/BandMatrix.h" #include "src/Core/CoreIterators.h" +#include "src/Core/ConditionEstimator.h" #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 1d767d5c8..538aff956 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -13,7 +13,7 @@ #ifndef EIGEN_LDLT_H #define EIGEN_LDLT_H -namespace Eigen { +namespace Eigen { namespace internal { template<typename MatrixType, int UpLo> struct LDLT_Traits; @@ -73,11 +73,11 @@ template<typename _MatrixType, int _UpLo> class LDLT * The default constructor is useful in cases in which the user intends to * perform decompositions via LDLT::compute(const MatrixType&). */ - LDLT() - : m_matrix(), - m_transpositions(), + LDLT() + : m_matrix(), + m_transpositions(), m_sign(internal::ZeroSign), - m_isInitialized(false) + m_isInitialized(false) {} /** \brief Default Constructor with memory preallocation @@ -168,7 +168,7 @@ template<typename _MatrixType, int _UpLo> class LDLT * \note_about_checking_solutions * * More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$ - * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, + * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function @@ -192,6 +192,15 @@ template<typename _MatrixType, int _UpLo> class LDLT template<typename InputType> LDLT& compute(const EigenBase<InputType>& matrix); + /** \returns an estimate of the reciprocal condition number of the matrix of + * which \c *this is the LDLT decomposition. + */ + RealScalar rcond() const + { + eigen_assert(m_isInitialized && "LDLT is not initialized."); + return internal::rcond_estimate_helper(m_l1_norm, *this); + } + template <typename Derived> LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1); @@ -207,6 +216,13 @@ template<typename _MatrixType, int _UpLo> class LDLT MatrixType reconstructedMatrix() const; + /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint. + * + * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: + * \code x = decomposition.adjoint().solve(b) \endcode + */ + const LDLT& adjoint() const { return *this; }; + inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } @@ -220,7 +236,7 @@ template<typename _MatrixType, int _UpLo> class LDLT eigen_assert(m_isInitialized && "LDLT is not initialized."); return Success; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> EIGEN_DEVICE_FUNC @@ -228,7 +244,7 @@ template<typename _MatrixType, int _UpLo> class LDLT #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); @@ -241,6 +257,7 @@ template<typename _MatrixType, int _UpLo> class LDLT * is not stored), and the diagonal entries correspond to D. */ MatrixType m_matrix; + RealScalar m_l1_norm; TranspositionType m_transpositions; TmpMatrixType m_temporary; internal::SignMatrix m_sign; @@ -314,7 +331,7 @@ template<> struct ldlt_inplace<Lower> if(rs>0) A21.noalias() -= A20 * temp.head(k); } - + // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot // was smaller than the cutoff value. However, since LDLT is not rank-revealing // we should only make sure that we do not introduce INF or NaN values. @@ -433,12 +450,25 @@ template<typename InputType> LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a) { check_template_parameters(); - + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix = a.derived(); + // Compute matrix L1 norm = max abs column sum. + m_l1_norm = RealScalar(0); + // TODO move this code to SelfAdjointView + for (Index col = 0; col < size; ++col) { + RealScalar abs_col_sum; + if (_UpLo == Lower) + abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>(); + else + abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>(); + if (abs_col_sum > m_l1_norm) + m_l1_norm = abs_col_sum; + } + m_transpositions.resize(size); m_isInitialized = false; m_temporary.resize(size); @@ -466,7 +496,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri eigen_assert(m_matrix.rows()==size); } else - { + { m_matrix.resize(size,size); m_matrix.setZero(); m_transpositions.resize(size); @@ -505,7 +535,7 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // diagonal element is not well justified and leads to numerical issues in some cases. // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest(); - + for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 74cf5bfe1..19578b216 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -10,7 +10,7 @@ #ifndef EIGEN_LLT_H #define EIGEN_LLT_H -namespace Eigen { +namespace Eigen { namespace internal{ template<typename MatrixType, int UpLo> struct LLT_Traits; @@ -40,7 +40,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits; * * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out - * + * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) @@ -135,6 +135,16 @@ template<typename _MatrixType, int _UpLo> class LLT template<typename InputType> LLT& compute(const EigenBase<InputType>& matrix); + /** \returns an estimate of the reciprocal condition number of the matrix of + * which \c *this is the Cholesky decomposition. + */ + RealScalar rcond() const + { + eigen_assert(m_isInitialized && "LLT is not initialized."); + eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative"); + return internal::rcond_estimate_helper(m_l1_norm, *this); + } + /** \returns the LLT decomposition matrix * * TODO: document the storage layout @@ -159,12 +169,19 @@ template<typename _MatrixType, int _UpLo> class LLT return m_info; } + /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint. + * + * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: + * \code x = decomposition.adjoint().solve(b) \endcode + */ + const LLT& adjoint() const { return *this; }; + inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } template<typename VectorType> LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); - + #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> EIGEN_DEVICE_FUNC @@ -172,17 +189,18 @@ template<typename _MatrixType, int _UpLo> class LLT #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + /** \internal * Used to compute and store L * The strict upper part is not used and even not initialized. */ MatrixType m_matrix; + RealScalar m_l1_norm; bool m_isInitialized; ComputationInfo m_info; }; @@ -268,7 +286,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower> static Index unblocked(MatrixType& mat) { using std::sqrt; - + eigen_assert(mat.rows()==mat.cols()); const Index size = mat.rows(); for(Index k = 0; k < size; ++k) @@ -328,7 +346,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower> return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } }; - + template<typename Scalar> struct llt_inplace<Scalar, Upper> { typedef typename NumTraits<Scalar>::Real RealScalar; @@ -387,12 +405,25 @@ template<typename InputType> LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a) { check_template_parameters(); - + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); m_matrix = a.derived(); + // Compute matrix L1 norm = max abs column sum. + m_l1_norm = RealScalar(0); + // TODO move this code to SelfAdjointView + for (Index col = 0; col < size; ++col) { + RealScalar abs_col_sum; + if (_UpLo == Lower) + abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>(); + else + abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>(); + if (abs_col_sum > m_l1_norm) + m_l1_norm = abs_col_sum; + } + m_isInitialized = true; bool ok = Traits::inplace_decomposition(m_matrix); m_info = ok ? Success : NumericalIssue; @@ -419,7 +450,7 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c return *this; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename _MatrixType,int _UpLo> template<typename RhsType, typename DstType> @@ -431,7 +462,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const #endif /** \internal use x = llt_object.solve(x); - * + * * This is the \em in-place version of solve(). * * \param bAndX represents both the right-hand side matrix b and result x. @@ -483,7 +514,7 @@ SelfAdjointView<MatrixType, UpLo>::llt() const return LLT<PlainObject,UpLo>(m_matrix); } #endif // __CUDACC__ - + } // end namespace Eigen #endif // EIGEN_LLT_H diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 3de8aa9a2..9d4b315a0 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -29,13 +29,10 @@ struct copy_using_evaluator_traits { typedef typename DstEvaluator::XprType Dst; typedef typename Dst::Scalar DstScalar; - // TODO distinguish between linear traversal and inner-traversals - typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type PacketType; enum { DstFlags = DstEvaluator::Flags, - SrcFlags = SrcEvaluator::Flags, - RequiredAlignment = unpacket_traits<PacketType>::alignment + SrcFlags = SrcEvaluator::Flags }; public: @@ -55,10 +52,25 @@ private: : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime) : int(Dst::MaxRowsAtCompileTime), OuterStride = int(outer_stride_at_compile_time<Dst>::ret), - MaxSizeAtCompileTime = Dst::SizeAtCompileTime, - PacketSize = unpacket_traits<PacketType>::size + MaxSizeAtCompileTime = Dst::SizeAtCompileTime + }; + + // TODO distinguish between linear traversal and inner-traversals + typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType; + typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType; + + enum { + LinearPacketSize = unpacket_traits<LinearPacketType>::size, + InnerPacketSize = unpacket_traits<InnerPacketType>::size }; +public: + enum { + LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment, + InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment + }; + +private: enum { DstIsRowMajor = DstFlags&RowMajorBit, SrcIsRowMajor = SrcFlags&RowMajorBit, @@ -67,16 +79,16 @@ private: && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit) && (functor_traits<AssignFunc>::PacketAccess), MayInnerVectorize = MightVectorize - && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0 - && int(OuterStride)!=Dynamic && int(OuterStride)%int(PacketSize)==0 - && int(JointAlignment)>=int(RequiredAlignment), + && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0 + && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 + && int(JointAlignment)>=int(InnerRequiredAlignment), MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess - && ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), + && ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ MaySliceVectorize = MightVectorize && DstHasDirectAccess - && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize) + && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize) /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block in a fixed-size matrix */ @@ -84,7 +96,8 @@ private: public: enum { - Traversal = int(MayInnerVectorize) ? int(InnerVectorizedTraversal) + Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) : int(MayLinearize) ? int(LinearTraversal) @@ -94,9 +107,14 @@ public: || int(Traversal) == SliceVectorizedTraversal }; + typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType; + private: enum { - UnrollingLimit = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1), + ActualPacketSize = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize + : Vectorized ? InnerPacketSize + : 1, + UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize, MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit), MayUnrollInner = int(InnerSize) != Dynamic @@ -112,7 +130,7 @@ public: : int(NoUnrolling) ) : int(Traversal) == int(LinearVectorizedTraversal) - ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling) + ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling) : int(NoUnrolling) ) : int(Traversal) == int(LinearTraversal) ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) @@ -131,11 +149,13 @@ public: std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(DstAlignment) EIGEN_DEBUG_VAR(SrcAlignment) - EIGEN_DEBUG_VAR(RequiredAlignment) + EIGEN_DEBUG_VAR(LinearRequiredAlignment) + EIGEN_DEBUG_VAR(InnerRequiredAlignment) EIGEN_DEBUG_VAR(JointAlignment) EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerMaxSize) - EIGEN_DEBUG_VAR(PacketSize) + EIGEN_DEBUG_VAR(LinearPacketSize) + EIGEN_DEBUG_VAR(InnerPacketSize) EIGEN_DEBUG_VAR(StorageOrdersAgree) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearize) @@ -370,7 +390,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling> typedef typename Kernel::Scalar Scalar; typedef typename Kernel::PacketType PacketType; enum { - requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment, + requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment, packetSize = unpacket_traits<PacketType>::size, dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment) @@ -484,7 +504,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> typedef typename Kernel::PacketType PacketType; enum { packetSize = unpacket_traits<PacketType>::size, - requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment), + requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment), alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar), dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), dstAlignment = alignable ? int(requestedAlignment) diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h new file mode 100644 index 000000000..68c5e918e --- /dev/null +++ b/Eigen/src/Core/ConditionEstimator.h @@ -0,0 +1,166 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CONDITIONESTIMATOR_H +#define EIGEN_CONDITIONESTIMATOR_H + +namespace Eigen { + +namespace internal { + +template <typename Vector, typename RealVector, bool IsComplex> +struct rcond_compute_sign { + static inline Vector run(const Vector& v) { + const RealVector v_abs = v.cwiseAbs(); + return (v_abs.array() == static_cast<typename Vector::RealScalar>(0)) + .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs)); + } +}; + +// Partial specialization to avoid elementwise division for real vectors. +template <typename Vector> +struct rcond_compute_sign<Vector, Vector, false> { + static inline Vector run(const Vector& v) { + return (v.array() < static_cast<typename Vector::RealScalar>(0)) + .select(-Vector::Ones(v.size()), Vector::Ones(v.size())); + } +}; + +/** \brief Reciprocal condition number estimator. + * + * Computing a decomposition of a dense matrix takes O(n^3) operations, while + * this method estimates the condition number quickly and reliably in O(n^2) + * operations. + * + * \returns an estimate of the reciprocal condition number + * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and + * its decomposition. Supports the following decompositions: FullPivLU, + * PartialPivLU, LDLT, and LLT. + * + * \sa FullPivLU, PartialPivLU, LDLT, LLT. + */ +template <typename Decomposition> +typename Decomposition::RealScalar +rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) +{ + typedef typename Decomposition::RealScalar RealScalar; + eigen_assert(dec.rows() == dec.cols()); + if (dec.rows() == 0) return RealScalar(1); + if (matrix_norm == RealScalar(0)) return RealScalar(0); + if (dec.rows() == 1) return RealScalar(1); + const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); + return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0) + : (RealScalar(1) / inverse_matrix_norm) / matrix_norm); +} + +/** + * \returns an estimate of ||inv(matrix)||_1 given a decomposition of + * \a matrix that implements .solve() and .adjoint().solve() methods. + * + * This function implements Algorithms 4.1 and 5.1 from + * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf + * which also forms the basis for the condition number estimators in + * LAPACK. Since at most 10 calls to the solve method of dec are + * performed, the total cost is O(dims^2), as opposed to O(dims^3) + * needed to compute the inverse matrix explicitly. + * + * The most common usage is in estimating the condition number + * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be + * computed directly in O(n^2) operations. + * + * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and + * LLT. + * + * \sa FullPivLU, PartialPivLU, LDLT, LLT. + */ +template <typename Decomposition> +typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec) +{ + typedef typename Decomposition::MatrixType MatrixType; + typedef typename Decomposition::Scalar Scalar; + typedef typename Decomposition::RealScalar RealScalar; + typedef typename internal::plain_col_type<MatrixType>::type Vector; + typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector; + const bool is_complex = (NumTraits<Scalar>::IsComplex != 0); + + eigen_assert(dec.rows() == dec.cols()); + const Index n = dec.rows(); + if (n == 0) + return 0; + + Vector v = dec.solve(Vector::Ones(n) / Scalar(n)); + + // lower_bound is a lower bound on + // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 + // and is the objective maximized by the ("super-") gradient ascent + // algorithm below. + RealScalar lower_bound = v.template lpNorm<1>(); + if (n == 1) + return lower_bound; + + // Gradient ascent algorithm follows: We know that the optimum is achieved at + // one of the simplices v = e_i, so in each iteration we follow a + // super-gradient to move towards the optimal one. + RealScalar old_lower_bound = lower_bound; + Vector sign_vector(n); + Vector old_sign_vector; + Index v_max_abs_index = -1; + Index old_v_max_abs_index = v_max_abs_index; + for (int k = 0; k < 4; ++k) + { + sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v); + if (k > 0 && !is_complex && sign_vector == old_sign_vector) { + // Break if the solution stagnated. + break; + } + // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )| + v = dec.adjoint().solve(sign_vector); + v.real().cwiseAbs().maxCoeff(&v_max_abs_index); + if (v_max_abs_index == old_v_max_abs_index) { + // Break if the solution stagnated. + break; + } + // Move to the new simplex e_j, where j = v_max_abs_index. + v = dec.solve(Vector::Unit(n, v_max_abs_index)); // v = inv(matrix) * e_j. + lower_bound = v.template lpNorm<1>(); + if (lower_bound <= old_lower_bound) { + // Break if the gradient step did not increase the lower_bound. + break; + } + if (!is_complex) { + old_sign_vector = sign_vector; + } + old_v_max_abs_index = v_max_abs_index; + old_lower_bound = lower_bound; + } + // The following calculates an independent estimate of ||matrix||_1 by + // multiplying matrix by a vector with entries of slowly increasing + // magnitude and alternating sign: + // v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. + // This improvement to Hager's algorithm above is due to Higham. It was + // added to make the algorithm more robust in certain corner cases where + // large elements in the matrix might otherwise escape detection due to + // exact cancellation (especially when op and op_adjoint correspond to a + // sequence of backsubstitutions and permutations), which could cause + // Hager's algorithm to vastly underestimate ||matrix||_1. + Scalar alternating_sign(RealScalar(1)); + for (Index i = 0; i < n; ++i) { + v[i] = alternating_sign * (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1)))); + alternating_sign = -alternating_sign; + } + v = dec.solve(v); + const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n)); + return numext::maxi(lower_bound, alternate_lower_bound); +} + +} // namespace internal + +} // namespace Eigen + +#endif diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 53f934999..f7c5f4276 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -81,6 +81,8 @@ public: * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ // FIXME I'm not sure the current mapping is the ideal one. template<int M, int N> struct product_type_selector<M,N,1> { enum { ret = OuterProduct }; }; +template<int M> struct product_type_selector<M, 1, 1> { enum { ret = LazyCoeffBasedProductMode }; }; +template<int N> struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; }; template<int Depth> struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; template<> struct product_type_selector<Small,1, Small> { enum { ret = CoeffBasedProductMode }; }; diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 8e7dd2b73..5771abf7d 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1025,6 +1025,66 @@ double log(const double &x) { return ::log(x); } template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +typename NumTraits<T>::Real abs(const T &x) { + EIGEN_USING_STD_MATH(abs); + return abs(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float abs(const float &x) { return ::fabsf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double abs(const double &x) { return ::fabs(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T exp(const T &x) { + EIGEN_USING_STD_MATH(exp); + return exp(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float exp(const float &x) { return ::expf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double exp(const double &x) { return ::exp(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T cos(const T &x) { + EIGEN_USING_STD_MATH(cos); + return cos(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float cos(const float &x) { return ::cosf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double cos(const double &x) { return ::cos(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sin(const T &x) { + EIGEN_USING_STD_MATH(sin); + return sin(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sin(const float &x) { return ::sinf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sin(const double &x) { return ::sin(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T &x) { EIGEN_USING_STD_MATH(tan); return tan(x); @@ -1040,34 +1100,94 @@ double tan(const double &x) { return ::tan(x); } template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename NumTraits<T>::Real abs(const T &x) { - EIGEN_USING_STD_MATH(abs); - return abs(x); +T acos(const T &x) { + EIGEN_USING_STD_MATH(acos); + return acos(x); } #ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float abs(const float &x) { return ::fabsf(x); } +float acos(const float &x) { return ::acosf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double abs(const double &x) { return ::fabs(x); } +double acos(const double &x) { return ::acos(x); } #endif template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T exp(const T &x) { - EIGEN_USING_STD_MATH(exp); - return exp(x); +T asin(const T &x) { + EIGEN_USING_STD_MATH(asin); + return asin(x); } #ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float exp(const float &x) { return ::expf(x); } +float asin(const float &x) { return ::asinf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double exp(const double &x) { return ::exp(x); } +double asin(const double &x) { return ::asin(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T atan(const T &x) { + EIGEN_USING_STD_MATH(atan); + return atan(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float atan(const float &x) { return ::atanf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double atan(const double &x) { return ::atan(x); } +#endif + + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T cosh(const T &x) { + EIGEN_USING_STD_MATH(cosh); + return cosh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float cosh(const float &x) { return ::coshf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double cosh(const double &x) { return ::cosh(x); } #endif +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sinh(const T &x) { + EIGEN_USING_STD_MATH(sinh); + return sinh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sinh(const float &x) { return ::sinhf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sinh(const double &x) { return ::sinh(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T tanh(const T &x) { + EIGEN_USING_STD_MATH(tanh); + return tanh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float tanh(const float &x) { return ::tanhf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double tanh(const double &x) { return ::tanh(x); } +#endif template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 3ce86e8cd..d9fd888cf 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -410,8 +410,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, typedef Product<Lhs, Rhs, LazyProduct> XprType; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketScalar PacketScalar; - typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr) @@ -437,16 +435,20 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, typedef evaluator<LhsNestedCleaned> LhsEtorType; typedef evaluator<RhsNestedCleaned> RhsEtorType; - + enum { RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime, ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime, InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime), MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime, - MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime, - - PacketSize = packet_traits<Scalar>::size, + MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime + }; + + typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType; + typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType; + enum { + LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost @@ -459,19 +461,23 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, LhsFlags = LhsEtorType::Flags, RhsFlags = RhsEtorType::Flags, - LhsAlignment = LhsEtorType::Alignment, - RhsAlignment = RhsEtorType::Alignment, - LhsRowMajor = LhsFlags & RowMajorBit, RhsRowMajor = RhsFlags & RowMajorBit, + + LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size, + RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size, + + // Here, we don't care about alignment larger than the usable packet size. + LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), + RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value, CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) - && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % PacketSize) == 0) ), + && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % RhsVecPacketSize) == 0) ), CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) - && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % PacketSize) == 0) ), + && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % LhsVecPacketSize) == 0) ), EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 @@ -491,10 +497,10 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, : 0, /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside - * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner - * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect - * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. - */ + * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner + * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect + * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. + */ CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) @@ -1000,7 +1006,7 @@ struct transposition_matrix_product const Index size = tr.size(); StorageIndex j = 0; - if(!(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))) + if(!is_same_dense(dst,mat)) dst = mat; for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index d170cae29..98b2fd868 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -27,8 +27,9 @@ template<typename Func, typename Derived> struct redux_traits { public: + typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType; enum { - PacketSize = packet_traits<typename Derived::Scalar>::size, + PacketSize = unpacket_traits<PacketType>::size, InnerMaxSize = int(Derived::IsRowMajor) ? Derived::MaxColsAtCompileTime : Derived::MaxRowsAtCompileTime @@ -137,12 +138,12 @@ template<typename Func, typename Derived, int Start, int Length> struct redux_vec_unroller { enum { - PacketSize = packet_traits<typename Derived::Scalar>::size, + PacketSize = redux_traits<Func, Derived>::PacketSize, HalfLength = Length/2 }; typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketScalar; + typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) { @@ -156,14 +157,14 @@ template<typename Func, typename Derived, int Start> struct redux_vec_unroller<Func, Derived, Start, 1> { enum { - index = Start * packet_traits<typename Derived::Scalar>::size, + index = Start * redux_traits<Func, Derived>::PacketSize, outer = index / int(Derived::InnerSizeAtCompileTime), inner = index % int(Derived::InnerSizeAtCompileTime), alignment = Derived::Alignment }; typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketScalar; + typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) { @@ -209,13 +210,13 @@ template<typename Func, typename Derived> struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling> { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketScalar; + typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; static Scalar run(const Derived &mat, const Func& func) { const Index size = mat.size(); - const Index packetSize = packet_traits<Scalar>::size; + const Index packetSize = redux_traits<Func, Derived>::PacketSize; const int packetAlignment = unpacket_traits<PacketScalar>::alignment; enum { alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), @@ -268,7 +269,7 @@ template<typename Func, typename Derived, int Unrolling> struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling> { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketType; + typedef typename redux_traits<Func, Derived>::PacketType PacketType; EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) { @@ -276,7 +277,7 @@ struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling> const Index innerSize = mat.innerSize(); const Index outerSize = mat.outerSize(); enum { - packetSize = packet_traits<Scalar>::size + packetSize = redux_traits<Func, Derived>::PacketSize }; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; @@ -306,9 +307,10 @@ template<typename Func, typename Derived> struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling> { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketScalar; + + typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; enum { - PacketSize = packet_traits<Scalar>::size, + PacketSize = redux_traits<Func, Derived>::PacketSize, Size = Derived::SizeAtCompileTime, VectorizedSize = (Size / PacketSize) * PacketSize }; @@ -367,11 +369,11 @@ public: { return m_evaluator.coeff(index); } template<int LoadMode, typename PacketType> - PacketReturnType packet(Index row, Index col) const + PacketType packet(Index row, Index col) const { return m_evaluator.template packet<LoadMode,PacketType>(row, col); } template<int LoadMode, typename PacketType> - PacketReturnType packet(Index index) const + PacketType packet(Index index) const { return m_evaluator.template packet<LoadMode,PacketType>(index); } EIGEN_DEVICE_FUNC @@ -379,7 +381,7 @@ public: { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } template<int LoadMode, typename PacketType> - PacketReturnType packetByOuterInner(Index outer, Index inner) const + PacketType packetByOuterInner(Index outer, Index inner) const { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } const XprType & nestedExpression() const { return m_xpr; } diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 5a2010449..a33356423 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -213,7 +213,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv template<typename Dest> inline void evalTo(Dest& dst) const { - if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs))) + if(!is_same_dense(dst,m_rhs)) dst = m_rhs; m_triangularMatrix.template solveInPlace<Side>(dst); } diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index adb055b15..3513a5c63 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -281,20 +281,18 @@ struct digamma_impl { */ Scalar p, q, nz, s, w, y; - bool negative; + bool negative = false; const Scalar maxnum = NumTraits<Scalar>::infinity(); - const Scalar m_pi = EIGEN_PI; + const Scalar m_pi(EIGEN_PI); - negative = 0; - nz = 0.0; - - const Scalar zero = 0.0; - const Scalar one = 1.0; - const Scalar half = 0.5; + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + nz = zero; if (x <= zero) { - negative = one; + negative = true; q = x; p = numext::floor(q); if (p == q) { @@ -463,7 +461,7 @@ template <typename Scalar> struct igammac_impl { EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) { - /* igamc() + /* igamc() * * Incomplete gamma integral (modified for Eigen) * @@ -519,26 +517,51 @@ struct igammac_impl { */ const Scalar zero = 0; const Scalar one = 1; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if ((x < zero) || (a <= zero)) { + // domain error + return nan; + } + + if ((x < one) || (x < a)) { + /* The checks above ensure that we meet the preconditions for + * igamma_impl::Impl(), so call it, rather than igamma_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igamma_impl<Scalar>::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igamma_impl calls igammac_impl::Impl. */ + friend struct igamma_impl<Scalar>; + + /* Actually computes igamc(a, x). + * + * Preconditions: + * a > 0 + * x >= 1 + * x >= a + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; const Scalar two = 2; const Scalar machep = igamma_helper<Scalar>::machep(); const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); const Scalar big = igamma_helper<Scalar>::big(); const Scalar biginv = 1 / big; - const Scalar nan = NumTraits<Scalar>::quiet_NaN(); const Scalar inf = NumTraits<Scalar>::infinity(); Scalar ans, ax, c, yc, r, t, y, z; Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; - if ((x < zero) || ( a <= zero)) { - // domain error - return nan; - } - - if ((x < one) || (x < a)) { - return (one - igamma_impl<Scalar>::run(a, x)); - } - if (x == inf) return zero; // std::isinf crashes on CUDA /* Compute x**a * exp(-x) / gamma(a) */ @@ -618,7 +641,7 @@ template <typename Scalar> struct igamma_impl { EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) { - /* igam() + /* igam() * Incomplete gamma integral * * @@ -680,22 +703,47 @@ struct igamma_impl { */ const Scalar zero = 0; const Scalar one = 1; - const Scalar machep = igamma_helper<Scalar>::machep(); - const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); const Scalar nan = NumTraits<Scalar>::quiet_NaN(); - double ans, ax, c, r; - if (x == zero) return zero; - if ((x < zero) || ( a <= zero)) { // domain error + if ((x < zero) || (a <= zero)) { // domain error return nan; } if ((x > one) && (x > a)) { - return (one - igammac_impl<Scalar>::run(a, x)); + /* The checks above ensure that we meet the preconditions for + * igammac_impl::Impl(), so call it, rather than igammac_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igammac_impl<Scalar>::Impl(a, x)); } + return Impl(a, x); + } + + private: + /* igammac_impl calls igamma_impl::Impl. */ + friend struct igammac_impl<Scalar>; + + /* Actually computes igam(a, x). + * + * Preconditions: + * x > 0 + * a > 0 + * !(x > 1 && x > a) + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = igamma_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + + double ans, ax, c, r; + /* Compute x**a * exp(-x) / gamma(a) */ ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); if (ax < -maxlog) { diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index 7fe39808b..d2fe1e199 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -168,11 +168,12 @@ MatrixBase<Derived>::stableNorm() const DerivedCopy copy(derived()); enum { - CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME + CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization }; typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>, - typename DerivedCopyClean - ::ConstSegmentReturnType>::type SegmentWrapper; + typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; Index n = size(); if(n==1) diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index e6d137e40..5c5e5028e 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -532,7 +532,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat template<typename RhsType, typename DstType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const { - if(!(internal::is_same<RhsType,DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs))) + if(!internal::is_same_dense(dst,rhs)) dst = rhs; this->solveInPlace(dst); } diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 281b8e4c6..6387f2870 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -70,12 +70,18 @@ struct half : public __half { explicit EIGEN_DEVICE_FUNC half(bool b) : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + explicit EIGEN_DEVICE_FUNC half(unsigned int ui) + : __half(internal::float_to_half_rtne(static_cast<float>(ui))) {} explicit EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast<float>(i))) {} + explicit EIGEN_DEVICE_FUNC half(unsigned long ul) + : __half(internal::float_to_half_rtne(static_cast<float>(ul))) {} explicit EIGEN_DEVICE_FUNC half(long l) : __half(internal::float_to_half_rtne(static_cast<float>(l))) {} explicit EIGEN_DEVICE_FUNC half(long long ll) : __half(internal::float_to_half_rtne(static_cast<float>(ll))) {} + explicit EIGEN_DEVICE_FUNC half(unsigned long long ull) + : __half(internal::float_to_half_rtne(static_cast<float>(ull))) {} explicit EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {} explicit EIGEN_DEVICE_FUNC half(double d) @@ -401,6 +407,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); } + template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { Eigen::half result; result.x = a.x & 0x7FFF; @@ -418,6 +425,18 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::h template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) { return Eigen::half(::powf(float(a), float(b))); } +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sin(const Eigen::half& a) { + return Eigen::half(::sinf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half cos(const Eigen::half& a) { + return Eigen::half(::cosf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tan(const Eigen::half& a) { + return Eigen::half(::tanf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tanh(const Eigen::half& a) { + return Eigen::half(::tanhf(float(a))); +} template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { return Eigen::half(::floorf(float(a))); } @@ -425,6 +444,51 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::h return Eigen::half(::ceilf(float(a))); } +template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::half& a, const Eigen::half& b) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast<float>(a); + const float f2 = static_cast<float>(b); + return f2 < f1 ? b : a; +#endif +} +template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::half& a, const Eigen::half& b) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast<float>(a); + const float f2 = static_cast<float>(b); + return f1 < f2 ? b : a; +#endif +} + +#ifdef EIGEN_HAS_C99_MATH +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::digamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) { + return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) { + return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erf(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erfc(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x))); +} +#endif } // end namespace numext } // end namespace Eigen @@ -466,6 +530,11 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a namespace std { +EIGEN_ALWAYS_INLINE ostream& operator << (ostream& os, const Eigen::half& v) { + os << static_cast<float>(v); + return os; +} + #if __cplusplus > 199711L template <> struct hash<Eigen::half> { diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index e28fecfd0..5cd8ca950 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -345,6 +345,22 @@ template<> struct functor_traits<scalar_boolean_or_op> { }; /** \internal + * \brief Template functor to compute the xor of two booleans + * + * \sa class CwiseBinaryOp, ArrayBase::operator^ + */ +struct scalar_boolean_xor_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } +}; +template<> struct functor_traits<scalar_boolean_xor_op> { + enum { + Cost = NumTraits<bool>::AddCost, + PacketAccess = false + }; +}; + +/** \internal * \brief Template functor to compute the incomplete gamma function igamma(a, x) * * \sa class CwiseBinaryOp, Cwise::igamma diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 7ba0abedc..5baba1494 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -234,9 +234,33 @@ template<typename Scalar> struct scalar_exp_op { template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } }; -template<typename Scalar> -struct functor_traits<scalar_exp_op<Scalar> > -{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; }; +template <typename Scalar> +struct functor_traits<scalar_exp_op<Scalar> > { + enum { + PacketAccess = packet_traits<Scalar>::HasExp, + // The following numbers are based on the AVX implementation. +#ifdef EIGEN_VECTORIZE_FMA + // Haswell can issue 2 add/mul/madd per cycle. + Cost = + (sizeof(Scalar) == 4 + // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other + ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost) + // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other + : (14 * NumTraits<Scalar>::AddCost + + 6 * NumTraits<Scalar>::MulCost + + NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)) +#else + Cost = + (sizeof(Scalar) == 4 + // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other + ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost) + // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other + : (23 * NumTraits<Scalar>::AddCost + + 12 * NumTraits<Scalar>::MulCost + + NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)) +#endif + }; +}; /** \internal * @@ -250,9 +274,24 @@ template<typename Scalar> struct scalar_log_op { template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); } }; -template<typename Scalar> -struct functor_traits<scalar_log_op<Scalar> > -{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; }; +template <typename Scalar> +struct functor_traits<scalar_log_op<Scalar> > { + enum { + PacketAccess = packet_traits<Scalar>::HasLog, + Cost = + (PacketAccess + // The following numbers are based on the AVX implementation. +#ifdef EIGEN_VECTORIZE_FMA + // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle. + ? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost) +#else + // 8 pmadd, 6 pmul, 8 padd/psub, 20 other + ? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost) +#endif + // Measured cost of std::log. + : sizeof(Scalar)==4 ? 40 : 85) + }; +}; /** \internal * @@ -280,10 +319,19 @@ template<typename Scalar> struct scalar_sqrt_op { template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } }; -template<typename Scalar> -struct functor_traits<scalar_sqrt_op<Scalar> > -{ enum { - Cost = 5 * NumTraits<Scalar>::MulCost, +template <typename Scalar> +struct functor_traits<scalar_sqrt_op<Scalar> > { + enum { +#if EIGEN_FAST_MATH + // The following numbers are based on the AVX implementation. + Cost = (sizeof(Scalar) == 8 ? 28 + // 4 pmul, 1 pmadd, 3 other + : (3 * NumTraits<Scalar>::AddCost + + 5 * NumTraits<Scalar>::MulCost)), +#else + // The following numbers are based on min VSQRT throughput on Haswell. + Cost = (sizeof(Scalar) == 8 ? 28 : 14), +#endif PacketAccess = packet_traits<Scalar>::HasSqrt }; }; @@ -313,7 +361,7 @@ struct functor_traits<scalar_rsqrt_op<Scalar> > */ template<typename Scalar> struct scalar_cos_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op) - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); } + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); } }; @@ -332,7 +380,7 @@ struct functor_traits<scalar_cos_op<Scalar> > */ template<typename Scalar> struct scalar_sin_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); } }; @@ -352,7 +400,7 @@ struct functor_traits<scalar_sin_op<Scalar> > */ template<typename Scalar> struct scalar_tan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); } }; @@ -371,7 +419,7 @@ struct functor_traits<scalar_tan_op<Scalar> > */ template<typename Scalar> struct scalar_acos_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); } }; @@ -390,7 +438,7 @@ struct functor_traits<scalar_acos_op<Scalar> > */ template<typename Scalar> struct scalar_asin_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); } }; @@ -546,7 +594,7 @@ struct functor_traits<scalar_erfc_op<Scalar> > */ template<typename Scalar> struct scalar_atan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); } }; @@ -566,7 +614,7 @@ struct functor_traits<scalar_atan_op<Scalar> > */ template<typename Scalar> struct scalar_tanh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tanh; return tanh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tanh(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); } }; @@ -574,8 +622,24 @@ template<typename Scalar> struct functor_traits<scalar_tanh_op<Scalar> > { enum { - Cost = 5 * NumTraits<Scalar>::MulCost, - PacketAccess = packet_traits<Scalar>::HasTanh + PacketAccess = packet_traits<Scalar>::HasTanh, + Cost = + (PacketAccess + // The following numbers are based on the AVX implementation, +#ifdef EIGEN_VECTORIZE_FMA + // Haswell can issue 2 add/mul/madd per cycle. + // 9 pmadd, 2 pmul, 1 div, 2 other + ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost + + NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost) +#else + ? (11 * NumTraits<Scalar>::AddCost + + 11 * NumTraits<Scalar>::MulCost + + NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost) +#endif + // This number assumes a naive implementation of tanh + : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost + + 2 * NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost + + functor_traits<scalar_exp_op<Scalar> >::Cost)) }; }; @@ -585,7 +649,7 @@ struct functor_traits<scalar_tanh_op<Scalar> > */ template<typename Scalar> struct scalar_sinh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sinh; return sinh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); } }; @@ -604,7 +668,7 @@ struct functor_traits<scalar_sinh_op<Scalar> > */ template<typename Scalar> struct scalar_cosh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::cosh; return cosh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); } }; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 4c1a63d40..a96c7bfd4 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -11,8 +11,8 @@ #define EIGEN_GENERAL_BLOCK_PANEL_H -namespace Eigen { - +namespace Eigen { + namespace internal { template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> @@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024; #endif /** \internal */ -struct CacheSizes { +struct CacheSizes { CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { int l1CacheSize, l2CacheSize, l3CacheSize; queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); @@ -89,7 +89,7 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff * * \sa setCpuCacheSizes */ -template<typename LhsScalar, typename RhsScalar, int KcFactor> +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) { typedef gebp_traits<LhsScalar,RhsScalar> Traits; @@ -107,21 +107,17 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n enum { kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), ksub = Traits::mr * Traits::nr * sizeof(ResScalar), - k_mask = -8, - + kr = 8, mr = Traits::mr, - mr_mask = -mr, - - nr = Traits::nr, - nr_mask = -nr + nr = Traits::nr }; // Increasing k gives us more time to prefetch the content of the "C" // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). - const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320); + const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320); if (k_cache < k) { - k = k_cache & k_mask; + k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); } @@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n if (n_cache <= n_per_thread) { // Don't exceed the capacity of the l2 cache. eigen_internal_assert(n_cache >= static_cast<Index>(nr)); - n = n_cache & nr_mask; + n = n_cache - (n_cache % nr); eigen_internal_assert(n > 0); } else { - n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask); + n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); } if (l3 > l2) { @@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); const Index m_per_thread = numext::div_ceil(m, num_threads); if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) { - m = m_cache & mr_mask; + m = m_cache - (m_cache % mr); eigen_internal_assert(m > 0); } else { - m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask); + m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); } } } @@ -156,29 +152,29 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n l2 = 32*1024; l3 = 512*1024; #endif - + // Early return for small problems because the computation below are time consuming for small problems. // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway // because we use the coefficient-based implementation for them. - if((std::max)(k,(std::max)(m,n))<48) + if((numext::maxi)(k,(numext::maxi)(m,n))<48) return; - + typedef typename Traits::ResScalar ResScalar; enum { k_peeling = 8, k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) }; - + // ---- 1st level of blocking on L1, yields kc ---- - + // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache. // We also include a register-level block of the result (mx x nr). // (In an ideal world only the lhs panel would stay in L1) // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: - const Index max_kc = std::max<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); + const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); const Index old_k = k; if(k>max_kc) { @@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // while keeping the same number of sweeps over the result. k = (k%max_kc)==0 ? max_kc : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); - + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); } - + // ---- 2nd level of blocking on max(L2,L3), yields nc ---- - + // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is: // actual_l2 = max(l2, l3/nb_core_sharing_l3) // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) @@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n #else const Index actual_l2 = 1572864; // == 1.5 MB #endif - + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful @@ -223,7 +219,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); } // WARNING Below, we assume that Traits::nr is a power of two. - Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); if(n>nc) { // We are really blocking over the columns: @@ -252,9 +248,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // we have both L2 and L3, and problem is small enough to be kept in L2 // Let's choose m such that lhs's block fit in 1/3 of L2 actual_lm = l2; - max_mc = (std::min<Index>)(576,max_mc); + max_mc = (numext::mini<Index>)(576,max_mc); } - Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; else if (mc==0) return; m = (m%mc)==0 ? mc @@ -263,13 +259,14 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n } } +template <typename Index> inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) { #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); - m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); - n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); return true; } #else @@ -296,11 +293,11 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) * * \sa setCpuCacheSizes */ -template<typename LhsScalar, typename RhsScalar, int KcFactor> +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { if (!useSpecificBlockingSizes(k, m, n)) { - evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads); + evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads); } typedef gebp_traits<LhsScalar,RhsScalar> Traits; @@ -314,10 +311,10 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads if (n > nr) n -= n % nr; } -template<typename LhsScalar, typename RhsScalar> +template<typename LhsScalar, typename RhsScalar, typename Index> inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { - computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads); + computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads); } #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD @@ -2225,6 +2222,16 @@ inline std::ptrdiff_t l2CacheSize() return l2; } +/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\ +rs. +* \sa setCpuCacheSize */ +inline std::ptrdiff_t l3CacheSize() +{ + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); + return l3; +} + /** Set the cpu L1 and L2 cache sizes (in bytes). * These values are use to adjust the size of the blocks * for the algorithms working per blocks. diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 831089dee..80ba89465 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, - const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking) + const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking) { general_matrix_matrix_triangular_product<Index, RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 7c014b72a..f79840aa7 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -27,13 +27,13 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag }; static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha); + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha); }; template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version> EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version> ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index size = (std::min)(_rows,_cols); diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 208593718..1bed66ed8 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -83,7 +83,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju // coherence when accessing the rhs elements std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); - Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0; + Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0; subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr); for(Index k2=IsLower ? 0 : size; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index a0cbd2247..e2bb147dc 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -371,10 +371,10 @@ // Does the compiler support const expressions? #ifdef __CUDACC__ // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500) +#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500) #define EIGEN_HAS_CONSTEXPR 1 #endif -#elif (defined(__cplusplus) && __cplusplus >= 201402L) || \ +#elif __has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ EIGEN_GNUC_AT_LEAST(4,8) #define EIGEN_HAS_CONSTEXPR 1 #endif @@ -572,12 +572,12 @@ namespace Eigen { //------------------------------------------------------------------------------------------ // Static and dynamic alignment control -// +// // The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES // as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. // The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, // a default value is automatically computed based on architecture, compiler, and OS. -// +// // This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} // to be used to declare statically aligned buffers. //------------------------------------------------------------------------------------------ @@ -640,7 +640,7 @@ namespace Eigen { #ifndef EIGEN_MAX_STATIC_ALIGN_BYTES // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES - + // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in @@ -667,13 +667,13 @@ namespace Eigen { #else #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #endif - + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES #else #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 #endif - + #endif // If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index 74cd0a472..e9f3ebf88 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -243,8 +243,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS { workspace.resize(rows()); Index vecs = m_length; - if( internal::is_same<typename internal::remove_all<VectorsType>::type,Dest>::value - && internal::extract_data(dst) == internal::extract_data(m_vectors)) + if(is_same_dense(dst,m_vectors)) { // in-place dst.diagonal().setOnes(); diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 1721213d6..64b9eb7f1 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -231,6 +231,15 @@ template<typename _MatrixType> class FullPivLU return Solve<FullPivLU, Rhs>(*this, b.derived()); } + /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is + the LU decomposition. + */ + inline RealScalar rcond() const + { + eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); + return internal::rcond_estimate_helper(m_l1_norm, *this); + } + /** \returns the determinant of the matrix of which * *this is the LU decomposition. It has only linear complexity * (that is, O(n) where n is the dimension of the square matrix) @@ -410,6 +419,7 @@ template<typename _MatrixType> class FullPivLU IntColVectorType m_rowsTranspositions; IntRowVectorType m_colsTranspositions; Index m_det_pq, m_nonzero_pivots; + RealScalar m_l1_norm; RealScalar m_maxpivot, m_prescribedThreshold; bool m_isInitialized, m_usePrescribedThreshold; }; @@ -455,11 +465,12 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const EigenBase<InputType> // the permutations are stored as int indices, so just to be sure: eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest()); - m_isInitialized = true; m_lu = matrix.derived(); + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); computeInPlace(); + m_isInitialized = true; return *this; } diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index ab7797d2a..2e6d91939 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -76,7 +76,6 @@ template<typename _MatrixType> class PartialPivLU typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType; typedef typename MatrixType::PlainObject PlainObject; - /** * \brief Default Constructor. * @@ -152,6 +151,15 @@ template<typename _MatrixType> class PartialPivLU return Solve<PartialPivLU, Rhs>(*this, b.derived()); } + /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is + the LU decomposition. + */ + inline RealScalar rcond() const + { + eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); + return internal::rcond_estimate_helper(m_l1_norm, *this); + } + /** \returns the inverse of the matrix of which *this is the LU decomposition. * * \warning The matrix being decomposed here is assumed to be invertible. If you need to check for @@ -178,7 +186,7 @@ template<typename _MatrixType> class PartialPivLU * * \sa MatrixBase::determinant() */ - typename internal::traits<MatrixType>::Scalar determinant() const; + Scalar determinant() const; MatrixType reconstructedMatrix() const; @@ -247,6 +255,7 @@ template<typename _MatrixType> class PartialPivLU PermutationType m_p; TranspositionType m_rowsTranspositions; Index m_det_p; + RealScalar m_l1_norm; bool m_isInitialized; }; @@ -256,6 +265,7 @@ PartialPivLU<MatrixType>::PartialPivLU() m_p(), m_rowsTranspositions(), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { } @@ -266,6 +276,7 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size) m_p(size), m_rowsTranspositions(size), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { } @@ -277,6 +288,7 @@ PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix) m_p(matrix.rows()), m_rowsTranspositions(matrix.rows()), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { compute(matrix.derived()); @@ -467,6 +479,7 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu eigen_assert(matrix.rows()<NumTraits<int>::highest()); m_lu = matrix.derived(); + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = matrix.rows(); @@ -484,7 +497,7 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu } template<typename MatrixType> -typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const +typename PartialPivLU<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const { eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); return Scalar(m_det_p) * m_lu.diagonal().prod(); diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index e71944fd7..230d0d23c 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -397,6 +397,10 @@ CompleteOrthogonalDecomposition<MatrixType>& CompleteOrthogonalDecomposition< const Index rank = m_cpqr.rank(); const Index cols = matrix.cols(); + const Index rows = matrix.rows(); + m_zCoeffs.resize((std::min)(rows, cols)); + m_temp.resize(cols); + if (rank < cols) { // We have reduced the (permuted) matrix to the form // [R11 R12] diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index bf5ff48c3..1940c8294 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -350,7 +350,8 @@ template<typename MatrixType, int QRPreconditioner> struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false> { typedef JacobiSVD<MatrixType, QRPreconditioner> SVD; - static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {} + typedef typename MatrixType::RealScalar RealScalar; + static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; } }; template<typename MatrixType, int QRPreconditioner> @@ -359,19 +360,30 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true> typedef JacobiSVD<MatrixType, QRPreconditioner> SVD; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q) + static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) { using std::sqrt; + using std::abs; Scalar z; JacobiRotation<Scalar> rot; RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p))); - + + const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)(); + const RealScalar precision = NumTraits<Scalar>::epsilon(); + if(n==0) { - z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); - work_matrix.row(p) *= z; - if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z); - if(work_matrix.coeff(q,q)!=Scalar(0)) + // make sure first column is zero + work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0); + + if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero) + { + // work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when computing n + z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); + work_matrix.row(p) *= z; + if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z); + } + if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero) { z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q); work_matrix.row(q) *= z; @@ -385,19 +397,25 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true> rot.s() = work_matrix.coeff(q,p) / n; work_matrix.applyOnTheLeft(p,q,rot); if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint()); - if(work_matrix.coeff(p,q) != Scalar(0)) + if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero) { z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); work_matrix.col(q) *= z; if(svd.computeV()) svd.m_matrixV.col(q) *= z; } - if(work_matrix.coeff(q,q) != Scalar(0)) + if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero) { z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q); work_matrix.row(q) *= z; if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z); } } + + // update largest diagonal entry + maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q)))); + // and check whether the 2x2 block is already diagonal + RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry); + return abs(work_matrix.coeff(p,q))>threshold || abs(work_matrix.coeff(q,p)) > threshold; } }; @@ -414,7 +432,6 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, JacobiRotation<RealScalar> rot1; RealScalar t = m.coeff(0,0) + m.coeff(1,1); RealScalar d = m.coeff(1,0) - m.coeff(0,1); - if(d == RealScalar(0)) { rot1.s() = RealScalar(0); @@ -707,6 +724,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig } /*** step 2. The main Jacobi SVD iteration. ***/ + RealScalar maxDiagEntry = m_workMatrix.cwiseAbs().diagonal().maxCoeff(); bool finished = false; while(!finished) @@ -722,25 +740,27 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig // if this 2x2 sub-matrix is not diagonal already... // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't // keep us iterating forever. Similarly, small denormal numbers are considered zero. - RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, - precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)), - abs(m_workMatrix.coeff(q,q)))); - // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791) + RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry); if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold) { finished = false; - // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal - internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q); - JacobiRotation<RealScalar> j_left, j_right; - internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right); - - // accumulate resulting Jacobi rotations - m_workMatrix.applyOnTheLeft(p,q,j_left); - if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose()); - - m_workMatrix.applyOnTheRight(p,q,j_right); - if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right); + // the complex to real operation returns true is the updated 2x2 block is not already diagonal + if(internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q, maxDiagEntry)) + { + JacobiRotation<RealScalar> j_left, j_right; + internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right); + + // accumulate resulting Jacobi rotations + m_workMatrix.applyOnTheLeft(p,q,j_left); + if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose()); + + m_workMatrix.applyOnTheRight(p,q,j_right); + if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right); + + // keep track of the largest diagonal coefficient + maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q)))); + } } } } diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index fe4a97120..9143a4c82 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -22,7 +22,7 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased> typedef CwiseUnaryOp<UnaryOp, ArgType> XprType; class InnerIterator; -// class ReverseInnerIterator; + class ReverseInnerIterator; enum { CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost, diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index 0ae3017cc..7e2efd452 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -986,7 +986,7 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); - if(&x.coeffRef(0) != x_ref.data()) + if(x.derived().data() != x_ref.data()) x = x_ref; m_info = info==0 ? Success : NumericalIssue; diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 9422c40bc..5694592d6 100644 --- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -280,3 +280,21 @@ operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived()); } +/** \returns an expression of the coefficient-wise ^ operator of *this and \a other + * + * \warning this operator is for expression of bool only. + * + * Example: \include Cwise_boolean_xor.cpp + * Output: \verbinclude Cwise_boolean_xor.out + * + * \sa operator&&(), select() + */ +template<typename OtherDerived> +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived> +operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const +{ + EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>(derived(),other.derived()); +} diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index 64666d75f..ea28496b7 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -22,7 +22,6 @@ # endif # include <windows.h> #elif defined(__APPLE__) -#include <CoreServices/CoreServices.h> #include <mach/mach_time.h> #else # include <unistd.h> diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 90b9bc741..62533a608 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -201,9 +201,15 @@ template <typename Device, typename T> class BenchmarkSuite { size_b[1] = k_/2; TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); +#ifndef EIGEN_HAS_INDEX_LIST Eigen::array<TensorIndex, 2> strides; strides[0] = 1; strides[1] = 2; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides; +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index d34bd73ca..14876556e 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -29,8 +29,8 @@ BM_FuncGPU(padding); BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); -//BM_FuncGPU(algebraicFunc); -//BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); BM_FuncGPU(rowReduction); BM_FuncGPU(colReduction); @@ -48,11 +48,11 @@ BM_FuncGPU(colReduction); BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); -/*BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); BM_FuncWithInputDimsGPU(contraction, N, N, 64); -*/ + // Convolutions #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ diff --git a/blas/level3_impl.h b/blas/level3_impl.h index beb36c47d..6c802cd5f 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -159,7 +159,7 @@ int EIGEN_BLAS_FUNC(trsm)(const char *side, const char *uplo, const char *opa, c return 0; int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4); - + if(SIDE(*side)==LEFT) { internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false); @@ -385,7 +385,7 @@ int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const int info = 0; if(UPLO(*uplo)==INVALID) info = 1; - else if(OP(*op)==INVALID) info = 2; + else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) ) info = 2; else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7; @@ -447,7 +447,7 @@ int EIGEN_BLAS_FUNC(syr2k)(const char *uplo, const char *op, const int *n, const int info = 0; if(UPLO(*uplo)==INVALID) info = 1; - else if(OP(*op)==INVALID) info = 2; + else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) ) info = 2; else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7; @@ -609,7 +609,7 @@ int EIGEN_BLAS_FUNC(herk)(const char *uplo, const char *op, const int *n, const else if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Lower>().setZero(); else matrix(c, *n, *n, *ldc).triangularView<StrictlyLower>() *= beta; - + if(beta!=Scalar(0)) { matrix(c, *n, *n, *ldc).diagonal().real() *= beta; diff --git a/test/cholesky.cpp b/test/cholesky.cpp index d652af5bf..b7abc230b 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -17,6 +17,12 @@ #include <Eigen/Cholesky> #include <Eigen/QR> +template<typename MatrixType, int UpLo> +typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) { + MatrixType symm = m.template selfadjointView<UpLo>(); + return symm.cwiseAbs().colwise().sum().maxCoeff(); +} + template<typename MatrixType,template <typename,int> class CholType> void test_chol_update(const MatrixType& symm) { typedef typename MatrixType::Scalar Scalar; @@ -77,7 +83,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m) { SquareMatrixType symmUp = symm.template triangularView<Upper>(); SquareMatrixType symmLo = symm.template triangularView<Lower>(); - + LLT<SquareMatrixType,Lower> chollo(symmLo); VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix()); vecX = chollo.solve(vecB); @@ -85,6 +91,14 @@ template<typename MatrixType> void cholesky(const MatrixType& m) matX = chollo.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); + const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols)); + RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) / + matrix_l1_norm<MatrixType, Lower>(symmLo_inverse); + RealScalar rcond_est = chollo.rcond(); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + // test the upper mode LLT<SquareMatrixType,Upper> cholup(symmUp); VERIFY_IS_APPROX(symm, cholup.reconstructedMatrix()); @@ -93,6 +107,15 @@ template<typename MatrixType> void cholesky(const MatrixType& m) matX = cholup.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. + const MatrixType symmUp_inverse = cholup.solve(MatrixType::Identity(rows,cols)); + rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) / + matrix_l1_norm<MatrixType, Upper>(symmUp_inverse); + rcond_est = cholup.rcond(); + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + + MatrixType neg = -symmLo; chollo.compute(neg); VERIFY(chollo.info()==NumericalIssue); @@ -101,7 +124,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m) VERIFY_IS_APPROX(MatrixType(chollo.matrixU().transpose().conjugate()), MatrixType(chollo.matrixL())); VERIFY_IS_APPROX(MatrixType(cholup.matrixL().transpose().conjugate()), MatrixType(cholup.matrixU())); VERIFY_IS_APPROX(MatrixType(cholup.matrixU().transpose().conjugate()), MatrixType(cholup.matrixL())); - + // test some special use cases of SelfCwiseBinaryOp: MatrixType m1 = MatrixType::Random(rows,cols), m2(rows,cols); m2 = m1; @@ -137,6 +160,15 @@ template<typename MatrixType> void cholesky(const MatrixType& m) matX = ldltlo.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); + const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols)); + RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) / + matrix_l1_norm<MatrixType, Lower>(symmLo_inverse); + RealScalar rcond_est = ldltlo.rcond(); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + + LDLT<SquareMatrixType,Upper> ldltup(symmUp); VERIFY_IS_APPROX(symm, ldltup.reconstructedMatrix()); vecX = ldltup.solve(vecB); @@ -144,6 +176,14 @@ template<typename MatrixType> void cholesky(const MatrixType& m) matX = ldltup.solve(matB); VERIFY_IS_APPROX(symm * matX, matB); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. + const MatrixType symmUp_inverse = ldltup.solve(MatrixType::Identity(rows,cols)); + rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) / + matrix_l1_norm<MatrixType, Upper>(symmUp_inverse); + rcond_est = ldltup.rcond(); + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); + VERIFY_IS_APPROX(MatrixType(ldltlo.matrixL().transpose().conjugate()), MatrixType(ldltlo.matrixU())); VERIFY_IS_APPROX(MatrixType(ldltlo.matrixU().transpose().conjugate()), MatrixType(ldltlo.matrixL())); VERIFY_IS_APPROX(MatrixType(ldltup.matrixL().transpose().conjugate()), MatrixType(ldltup.matrixU())); @@ -167,7 +207,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m) // restore if(sign == -1) symm = -symm; - + // check matrices coming from linear constraints with Lagrange multipliers if(rows>=3) { @@ -183,7 +223,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m) vecX = ldltlo.solve(vecB); VERIFY_IS_APPROX(A * vecX, vecB); } - + // check non-full rank matrices if(rows>=3) { @@ -199,7 +239,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m) vecX = ldltlo.solve(vecB); VERIFY_IS_APPROX(A * vecX, vecB); } - + // check matrices with a wide spectrum if(rows>=3) { @@ -225,7 +265,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m) { RealScalar large_tol = std::sqrt(test_precision<RealScalar>()); VERIFY((A * vecX).isApprox(vecB, large_tol)); - + ++g_test_level; VERIFY_IS_APPROX(A * vecX,vecB); --g_test_level; @@ -314,14 +354,14 @@ template<typename MatrixType> void cholesky_bug241(const MatrixType& m) } // LDLT is not guaranteed to work for indefinite matrices, but happens to work fine if matrix is diagonal. -// This test checks that LDLT reports correctly that matrix is indefinite. +// This test checks that LDLT reports correctly that matrix is indefinite. // See http://forum.kde.org/viewtopic.php?f=74&t=106942 and bug 736 template<typename MatrixType> void cholesky_definiteness(const MatrixType& m) { eigen_assert(m.rows() == 2 && m.cols() == 2); MatrixType mat; LDLT<MatrixType> ldlt(2); - + { mat << 1, 0, 0, -1; ldlt.compute(mat); @@ -384,11 +424,11 @@ void test_cholesky() CALL_SUBTEST_3( cholesky_definiteness(Matrix2d()) ); CALL_SUBTEST_4( cholesky(Matrix3f()) ); CALL_SUBTEST_5( cholesky(Matrix4d()) ); - - s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE); + + s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_2( cholesky(MatrixXd(s,s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) - + s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) @@ -402,6 +442,6 @@ void test_cholesky() // Test problem size constructors CALL_SUBTEST_9( LLT<MatrixXf>(10) ); CALL_SUBTEST_9( LDLT<MatrixXf>(10) ); - + TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries) } diff --git a/test/lu.cpp b/test/lu.cpp index f14435114..9787f4d86 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -11,6 +11,11 @@ #include <Eigen/LU> using namespace std; +template<typename MatrixType> +typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) { + return m.cwiseAbs().colwise().sum().maxCoeff(); +} + template<typename MatrixType> void lu_non_invertible() { typedef typename MatrixType::Index Index; @@ -143,7 +148,14 @@ template<typename MatrixType> void lu_invertible() m3 = MatrixType::Random(size,size); m2 = lu.solve(m3); VERIFY_IS_APPROX(m3, m1*m2); - VERIFY_IS_APPROX(m2, lu.inverse()*m3); + MatrixType m1_inverse = lu.inverse(); + VERIFY_IS_APPROX(m2, m1_inverse*m3); + + RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); + const RealScalar rcond_est = lu.rcond(); + // Verify that the estimated condition number is within a factor of 10 of the + // truth. + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); // test solve with transposed lu.template _solve_impl_transposed<false>(m3, m2); @@ -170,6 +182,7 @@ template<typename MatrixType> void lu_partial_piv() PartialPivLU.h */ typedef typename MatrixType::Index Index; + typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar; Index size = internal::random<Index>(1,4); MatrixType m1(size, size), m2(size, size), m3(size, size); @@ -181,7 +194,13 @@ template<typename MatrixType> void lu_partial_piv() m3 = MatrixType::Random(size,size); m2 = plu.solve(m3); VERIFY_IS_APPROX(m3, m1*m2); - VERIFY_IS_APPROX(m2, plu.inverse()*m3); + MatrixType m1_inverse = plu.inverse(); + VERIFY_IS_APPROX(m2, m1_inverse*m3); + + RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse); + const RealScalar rcond_est = plu.rcond(); + // Verify that the estimate is within a factor of 10 of the truth. + VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10); // test solve with transposed plu.template _solve_impl_transposed<false>(m3, m2); diff --git a/test/main.h b/test/main.h index bba5e7570..b0e3b7818 100644 --- a/test/main.h +++ b/test/main.h @@ -275,6 +275,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a)) +#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b)) +#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b)) + + #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) #define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(verifyIsApprox(a, b)) @@ -316,9 +320,9 @@ inline bool test_isMuchSmallerThan(const float& a, const float& b) { return internal::isMuchSmallerThan(a, b, test_precision<float>()); } inline bool test_isApproxOrLessThan(const float& a, const float& b) { return internal::isApproxOrLessThan(a, b, test_precision<float>()); } + inline bool test_isApprox(const double& a, const double& b) { return internal::isApprox(a, b, test_precision<double>()); } - inline bool test_isMuchSmallerThan(const double& a, const double& b) { return internal::isMuchSmallerThan(a, b, test_precision<double>()); } inline bool test_isApproxOrLessThan(const double& a, const double& b) @@ -359,6 +363,12 @@ inline bool test_isApproxOrLessThan(const long double& a, const long double& b) { return internal::isApproxOrLessThan(a, b, test_precision<long double>()); } #endif // EIGEN_TEST_NO_LONGDOUBLE +inline bool test_isApprox(const half& a, const half& b) +{ return internal::isApprox(a, b, test_precision<half>()); } +inline bool test_isMuchSmallerThan(const half& a, const half& b) +{ return internal::isMuchSmallerThan(a, b, test_precision<half>()); } +inline bool test_isApproxOrLessThan(const half& a, const half& b) +{ return internal::isApproxOrLessThan(a, b, test_precision<half>()); } // test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox. template<typename T1,typename T2> @@ -426,9 +436,7 @@ template<typename T1,typename T2> typename NumTraits<T1>::Real test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T1>::Real>::value, T1>::type* = 0) { typedef typename NumTraits<T1>::Real RealScalar; - using std::min; - using std::sqrt; - return sqrt(RealScalar(numext::abs2(a-b))/RealScalar((min)(numext::abs2(a),numext::abs2(b)))); + return numext::sqrt(RealScalar(numext::abs2(a-b))/RealScalar((numext::mini)(numext::abs2(a),numext::abs2(b)))); } template<typename T> diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index a3b469af8..0b381ec6c 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -148,10 +148,14 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast<CD>().eval()*mcd); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd); - VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>()); + VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>()); VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>()); - VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>()); + VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView<Upper>(), sd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Upper>()); + VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView<Lower>(), scd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Lower>()); + VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>()); VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>()); + VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Lower>()); + VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Upper>()); // Not supported yet: trmm // VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(), sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>()); diff --git a/test/product_large.cpp b/test/product_large.cpp index 98f84c53b..845cd40ca 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -71,7 +71,7 @@ void test_product_large() std::ptrdiff_t m1 = internal::random<int>(10,100)*16; std::ptrdiff_t n1 = internal::random<int>(10,100)*16; // only makes sure it compiles fine - internal::computeProductBlockingSizes<float,float>(k1,m1,n1,1); + internal::computeProductBlockingSizes<float,float,std::ptrdiff_t>(k1,m1,n1,1); } { diff --git a/test/rand.cpp b/test/rand.cpp index 6790acf15..eeec34191 100644 --- a/test/rand.cpp +++ b/test/rand.cpp @@ -29,6 +29,9 @@ template<typename Scalar> void check_all_in_range(Scalar x, Scalar y) { mask( check_in_range(x,y)-x )++; } + for(Index i=0; i<mask.size(); ++i) + if(mask(i)==0) + std::cout << "WARNING: value " << x+i << " not reached." << std::endl; VERIFY( (mask>0).all() ); } diff --git a/test/svd_fill.h b/test/svd_fill.h index 7e44b3d05..1bbe645ee 100644 --- a/test/svd_fill.h +++ b/test/svd_fill.h @@ -80,6 +80,8 @@ void svd_fill_random(MatrixType &m, int Option = 0) Index i = internal::random<Index>(0,m.rows()-1); Index j = internal::random<Index>(0,m.cols()-1); m(j,i) = m(i,j) = samples(internal::random<Index>(0,samples.size()-1)); + if(NumTraits<Scalar>::IsComplex) + *(&numext::real_ref(m(j,i))+1) = *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1)); } } } @@ -91,8 +93,14 @@ void svd_fill_random(MatrixType &m, int Option = 0) if(!(dup && unit_uv)) { Index n = internal::random<Index>(0,m.size()-1); - for(Index i=0; i<n; ++i) - m(internal::random<Index>(0,m.rows()-1), internal::random<Index>(0,m.cols()-1)) = samples(internal::random<Index>(0,samples.size()-1)); + for(Index k=0; k<n; ++k) + { + Index i = internal::random<Index>(0,m.rows()-1); + Index j = internal::random<Index>(0,m.cols()-1); + m(i,j) = samples(internal::random<Index>(0,samples.size()-1)); + if(NumTraits<Scalar>::IsComplex) + *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1)); + } } } } diff --git a/test/swap.cpp b/test/swap.cpp index 5d6f0e6af..f76e3624d 100644 --- a/test/swap.cpp +++ b/test/swap.cpp @@ -74,10 +74,13 @@ template<typename MatrixType> void swap(const MatrixType& m) m1 = m1_copy; m3 = m3_copy; - // test assertion on mismatching size -- matrix case - VERIFY_RAISES_ASSERT(m1.swap(m1.row(0))); - // test assertion on mismatching size -- xpr case - VERIFY_RAISES_ASSERT(m1.row(0).swap(m1)); + if(m1.rows()>1) + { + // test assertion on mismatching size -- matrix case + VERIFY_RAISES_ASSERT(m1.swap(m1.row(0))); + // test assertion on mismatching size -- xpr case + VERIFY_RAISES_ASSERT(m1.row(0).swap(m1)); + } } void test_swap() diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 35fbb9781..ee446c3c1 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -22,7 +22,11 @@ template<typename Dst, typename Src> bool test_assign(const Dst&, const Src&, int traversal, int unrolling) { typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar> > traits; - bool res = traits::Traversal==traversal && traits::Unrolling==unrolling; + bool res = traits::Traversal==traversal; + if(unrolling==InnerUnrolling+CompleteUnrolling) + res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling); + else + res = res && int(traits::Unrolling)==unrolling; if(!res) { std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl; @@ -147,10 +151,10 @@ struct vectorization_logic VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3), InnerVectorizedTraversal,CompleteUnrolling)); - + VERIFY(test_assign(Matrix44r().row(2),Matrix44r().row(1)+Matrix44r().row(1), InnerVectorizedTraversal,CompleteUnrolling)); - + if(PacketSize>1) { typedef Matrix<Scalar,3,3,ColMajor> Matrix33c; @@ -158,17 +162,29 @@ struct vectorization_logic LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), LinearTraversal,CompleteUnrolling)); - - VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), - PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling)); - + + VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()), + LinearVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(), HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal,NoUnrolling)); - + VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4), DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); + + VERIFY(test_assign(Vector1(),Matrix11()*Vector1(), + InnerVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()), + InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling)); } - + + VERIFY(test_redux(Vector1(), + LinearVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(), + LinearVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_redux(Matrix3(), LinearVectorizedTraversal,CompleteUnrolling)); @@ -226,6 +242,7 @@ struct vectorization_logic_half typedef Matrix<Scalar,PacketSize,1> Vector1; typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11; typedef Matrix<Scalar,5*PacketSize,7,ColMajor> Matrix57; + typedef Matrix<Scalar,3*PacketSize,5,ColMajor> Matrix35; typedef Matrix<Scalar,5*PacketSize,7,DontAlign|ColMajor> Matrix57u; // typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44; // typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u; @@ -291,12 +308,24 @@ struct vectorization_logic_half VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4), DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); + + VERIFY(test_assign(Vector1(),Matrix11()*Vector1(), + InnerVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()), + InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling)); } + VERIFY(test_redux(Vector1(), + LinearVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(), + LinearVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_redux(Matrix3(), LinearVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_redux(Matrix57(), + VERIFY(test_redux(Matrix35(), LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_redux(Matrix57().template block<PacketSize,3>(1,0), diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt index f1d9f0482..a40bc4715 100644 --- a/unsupported/Eigen/CXX11/CMakeLists.txt +++ b/unsupported/Eigen/CXX11/CMakeLists.txt @@ -1,4 +1,4 @@ -set(Eigen_CXX11_HEADERS Core Tensor TensorSymmetry) +set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool) install(FILES ${Eigen_CXX11_HEADERS} diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core deleted file mode 100644 index 946145f5a..000000000 --- a/unsupported/Eigen/CXX11/Core +++ /dev/null @@ -1,51 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler <christian@iwakd.de> -// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_CORE_MODULE -#define EIGEN_CXX11_CORE_MODULE - -#include <Eigen/Core> - -#include <Eigen/src/Core/util/DisableStupidWarnings.h> - -/** \defgroup CXX11_Core_Module C++11 Core Module - * - * This module provides common core features for all modules that - * explicitly depend on C++11. Currently, this is only the Tensor - * module. Note that at this stage, you should not need to include - * this module directly. - * - * It also provides a limited fallback for compilers that don't support - * CXX11 yet, such as nvcc. - * - * \code - * #include <Eigen/CXX11/Core> - * \endcode - */ - -#include <vector> - -#include "src/Core/util/EmulateArray.h" -#include "src/Core/util/MaxSizeVector.h" - -// Emulate the cxx11 functionality that we need if the compiler doesn't support it. -// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it -// supports enough of the standard for our needs -#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 -#include "src/Core/util/CXX11Workarounds.h" -#include "src/Core/util/CXX11Meta.h" -#else -#include "src/Core/util/EmulateCXX11Meta.h" -#endif - -#include <Eigen/src/Core/util/ReenableStupidWarnings.h> - -#endif // EIGEN_CXX11_CORE_MODULE - diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 16132398d..1e97ad3c0 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -11,10 +11,12 @@ //#ifndef EIGEN_CXX11_TENSOR_MODULE //#define EIGEN_CXX11_TENSOR_MODULE -#include "Core" +#include "../../../Eigen/Core" #include <Eigen/src/Core/util/DisableStupidWarnings.h> +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" /** \defgroup CXX11_Tensor_Module Tensor Module * @@ -26,6 +28,7 @@ * \endcode */ +#include <cmath> #include <cstddef> #include <cstring> @@ -51,11 +54,7 @@ typedef unsigned __int64 uint64_t; #endif #ifdef EIGEN_USE_THREADS -#include <atomic> -#include <condition_variable> -#include <deque> -#include <mutex> -#include <thread> +#include "ThreadPool" #endif #ifdef EIGEN_USE_GPU @@ -84,6 +83,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorBase.h" +#include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry index f1dc25fea..fb1b0c0fb 100644 --- a/unsupported/Eigen/CXX11/TensorSymmetry +++ b/unsupported/Eigen/CXX11/TensorSymmetry @@ -14,6 +14,8 @@ #include <Eigen/src/Core/util/DisableStupidWarnings.h> +#include "src/util/CXX11Meta.h" + /** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module * * This module provides a classes that allow for the definition of diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool new file mode 100644 index 000000000..09d637e9a --- /dev/null +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -0,0 +1,65 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_MODULE +#define EIGEN_CXX11_THREADPOOL_MODULE + +#include "../../../Eigen/Core" + +#include <Eigen/src/Core/util/DisableStupidWarnings.h> + +/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module + * + * This module provides 2 threadpool implementations + * - a simple reference implementation + * - a faster non blocking implementation + * + * This module requires C++11. + * + * \code + * #include <Eigen/CXX11/ThreadPool> + * \endcode + */ + + +// The code depends on CXX11, so only include the module if the +// compiler supports it. +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 +#include <cstddef> +#include <cstring> +#include <stdint.h> +#include <time.h> + +#include <vector> +#include <atomic> +#include <condition_variable> +#include <deque> +#include <mutex> +#include <thread> +#include <functional> +#include <memory> + +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" + +#include "src/ThreadPool/ThreadLocal.h" +#include "src/ThreadPool/ThreadYield.h" +#include "src/ThreadPool/EventCount.h" +#include "src/ThreadPool/RunQueue.h" +#include "src/ThreadPool/ThreadPoolInterface.h" +#include "src/ThreadPool/ThreadEnvironment.h" +#include "src/ThreadPool/SimpleThreadPool.h" +#include "src/ThreadPool/NonBlockingThreadPool.h" + +#endif + +#include <Eigen/src/Core/util/ReenableStupidWarnings.h> + +#endif // EIGEN_CXX11_THREADPOOL_MODULE + diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt index d90ee1b0f..1734262bb 100644 --- a/unsupported/Eigen/CXX11/src/CMakeLists.txt +++ b/unsupported/Eigen/CXX11/src/CMakeLists.txt @@ -1,3 +1,4 @@ -add_subdirectory(Core) +add_subdirectory(util) +add_subdirectory(ThreadPool) add_subdirectory(Tensor) add_subdirectory(TensorSymmetry) diff --git a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt deleted file mode 100644 index 28571dcb9..000000000 --- a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt deleted file mode 100644 index 1e3b14712..000000000 --- a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_Core_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_Core_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Core/util COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index f1ec04c49..babafe108 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -112,6 +112,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> return CoeffReturnType(index, m_impl.coeff(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 199d2ce41..5abff0800 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -89,6 +89,12 @@ template<typename LeftArgType, typename RightArgType, typename Device> struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> { typedef TensorAssignOp<LeftArgType, RightArgType> XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, @@ -104,12 +110,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // The dimensions of the lhs and the rhs tensors should be equal to prevent @@ -150,6 +150,19 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> return m_leftImpl.template packet<LoadMode>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here, but reduce left + // cost by one load because we are using m_leftImpl.coeffRef. + TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); + return m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost( + numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), + left.bytes_stored(), left.compute_cycles()) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 69d1802d5..1a34f3ccc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -334,6 +334,12 @@ class TensorBase<Derived, ReadOnlyAccessors> return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); } + template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived> + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_xor_op()); + } + // Comparisons and tests. template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6e6db12a..c771496e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -101,6 +101,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -140,9 +143,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -247,9 +247,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -284,12 +283,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. - if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { + if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { return m_impl.template packet<Unaligned>(inputIndex); } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < packetSize; ++i) { + for (int i = 1; i < PacketSize; ++i) { values[i] = coeffColMajor(originalIndex+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); @@ -300,9 +299,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -337,12 +335,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> // Todo: this could be extended to the second dimension if we're not // broadcasting alongside the first dimension, and so on. - if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) { + if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { return m_impl.template packet<Unaligned>(inputIndex); } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < packetSize; ++i) { + for (int i = 1; i < PacketSize; ++i) { values[i] = coeffRowMajor(originalIndex+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); @@ -350,6 +348,29 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost<Index>(); + if (NumDims > 0) { + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost<Index>(); + if (internal::index_statically_eq<Broadcast>()(i, 1)) { + compute_cost += + TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); + } else { + if (!internal::index_statically_eq<InputDimensions>()(i, 1)) { + compute_cost += TensorOpCost::MulCost<Index>() + + TensorOpCost::ModCost<Index>() + + TensorOpCost::AddCost<Index>(); + } + } + compute_cost += + TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c21a98fe0..2742dbb95 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -134,6 +134,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> typedef typename XprType::Index Index; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -180,9 +184,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> m_inputOffset = m_stride * op.offset(); } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -202,17 +203,16 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; } @@ -226,13 +226,13 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> } else { const Index idx = index / m_stride; const Index rem = index - idx * m_stride; - if (rem + packetSize <= m_stride) { + if (rem + PacketSize <= m_stride) { Index inputIndex = idx * m_inputStride + m_inputOffset + rem; return m_impl.template packet<LoadMode>(inputIndex); } else { // Cross the stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; } @@ -242,6 +242,28 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); + } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && + m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost<Index>(); + } else { + cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() + + 3 * TensorOpCost::AddCost<Index>(); + } + + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data()); if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) || @@ -298,6 +320,9 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> typedef typename XprType::Index Index; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -309,9 +334,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> : Base(op, device) { } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(this->srcCoeff(index)); @@ -320,17 +342,16 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) || (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } @@ -342,14 +363,14 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> } else { const Index idx = index / this->m_stride; const Index rem = index - idx * this->m_stride; - if (rem + packetSize <= this->m_stride) { + if (rem + PacketSize <= this->m_stride) { const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; this->m_impl.template writePacket<StoreMode>(inputIndex, x); } else { // Cross stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 7738f18fb..839c6e3e5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -260,6 +260,21 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>() + + TensorOpCost::ModCost<Index>()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * + m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index f070ba61e..97182258d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -426,6 +426,99 @@ struct TensorContractionEvaluatorBase buffer, resIncr, alpha); } + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; + typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + + const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size; + const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size; + + typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, + LeftEvaluator, left_nocontract_t, + contract_t, lhs_packet_size, + lhs_inner_dim_contiguous, + false, Unaligned> LhsMapper; + + typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, + RightEvaluator, right_nocontract_t, + contract_t, rhs_packet_size, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned> RhsMapper; + + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs; + internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs; + + internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2<m; i2+=mc) + { + const Index actual_mc = numext::mini(i2+mc,m)-i2; + for (Index k2 = 0; k2 < k; k2 += kc) { + // make sure we don't overshoot right edge of left matrix, then pack vertical panel + const Index actual_kc = numext::mini(k2 + kc, k) - k2; + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); + + // series of horizontal blocks + for (Index j2 = 0; j2 < n; j2 += nc) { + // make sure we don't overshoot right edge of right matrix, then pack block + const Index actual_nc = numext::mini(j2 + nc, n) - j2; + pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); + + // call gebp (matrix kernel) + // The parameters here are copied from Eigen's GEMM implementation + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0); + } + } + } + + this->m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -440,6 +533,10 @@ struct TensorContractionEvaluatorBase return m_result[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); @@ -529,100 +626,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT return; } - evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); - } - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; - - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size; - const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size; - - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - LeftEvaluator, left_nocontract_t, - contract_t, lhs_packet_size, - lhs_inner_dim_contiguous, - false, Unaligned> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - RightEvaluator, right_nocontract_t, - contract_t, rhs_packet_size, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned> RhsMapper; - - typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; - - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs; - internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs; - - internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1); - const Index kc = blocking.kc(); - const Index mc = numext::mini(m, blocking.mc()); - const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))); - - for(Index i2=0; i2<m; i2+=mc) - { - const Index actual_mc = numext::mini(i2+mc,m)-i2; - for (Index k2 = 0; k2 < k; k2 += kc) { - // make sure we don't overshoot right edge of left matrix, then pack vertical panel - const Index actual_kc = numext::mini(k2 + kc, k) - k2; - pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0); - - // series of horizontal blocks - for (Index j2 = 0; j2 < n; j2 += nc) { - // make sure we don't overshoot right edge of right matrix, then pack block - const Index actual_nc = numext::mini(j2 + nc, n) - j2; - pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0); - - // call gebp (matrix kernel) - // The parameters here are copied from Eigen's GEMM implementation - gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0); - } - } - } - - this->m_device.deallocate(blockA); - this->m_device.deallocate(blockB); + this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 3d3f6904f..5cf7b4f71 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -35,9 +35,7 @@ class TensorContractionBlocking { computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads); } else { - if (kc_ && mc_ && nc_) { - mc_ = (((m / num_threads) + 15) / 16) * 16; - } + computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads); } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a96776a77..a2f1f71f5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -177,7 +177,6 @@ template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eva }; - // Eval as rvalue template<typename TargetType, typename ArgType, typename Device> struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> @@ -190,6 +189,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename PacketType<SrcType, Device>::type PacketSourceType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -231,6 +231,21 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> return converter.template packet<LoadMode>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>(); + if (vectorized) { + const double SrcCoeffRatio = + internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; + const double TgtCoeffRatio = + internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4fe1fb943..ff3c5662d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -297,6 +297,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr typedef typename XprType::Index Index; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + enum { IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess, @@ -367,10 +372,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -405,7 +406,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { - const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; Index indices[2] = {index, index+PacketSize-1}; Index startInputs[2] = {0, 0}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -448,6 +448,23 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: @@ -773,6 +790,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType; typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } @@ -1044,6 +1062,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + private: // No assignment (copies are needed by the kernels) TensorEvaluator& operator = (const TensorEvaluator&); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h new file mode 100644 index 000000000..4e8f86674 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -0,0 +1,214 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H +#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H + +//#if !defined(EIGEN_USE_GPU) +//#define EIGEN_USE_COST_MODEL +//#endif + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief A cost model used to limit the number of threads used for evaluating + * tensor expression. + * + */ + +// Class storing the cost of evaluating a tensor expression in terms of the +// estimated number of operand bytes loads, bytes stored, and compute cycles. +class TensorOpCost { + public: + // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple + // model based on minimal reciprocal throughput numbers from Intel or + // Agner Fog's tables would be better than what is there now. + template <typename ArgType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() { + return internal::functor_traits< + internal::scalar_product_op<ArgType, ArgType>>::Cost; + } + template <typename ArgType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() { + return internal::functor_traits<internal::scalar_sum_op<ArgType>>::Cost; + } + template <typename ArgType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() { + return internal::functor_traits< + internal::scalar_quotient_op<ArgType, ArgType>>::Cost; + } + template <typename ArgType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() { + return internal::functor_traits<internal::scalar_mod_op<ArgType>>::Cost; + } + template <typename SrcType, typename TargetType> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() { + return internal::functor_traits< + internal::scalar_cast_op<SrcType, TargetType>>::Cost; + } + + TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(compute_cycles) {} + + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, + bool vectorized, double packet_size) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(vectorized ? compute_cycles / packet_size + : compute_cycles) { + using std::isfinite; + eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { + return bytes_loaded_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { + return bytes_stored_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { + return compute_cycles_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( + double load_cost, double store_cost, double compute_cost) const { + return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + + compute_cost * compute_cycles_; + } + + // Drop memory access component. Intended for cases when memory accesses are + // sequential or are completely masked by computations. + EIGEN_DEVICE_FUNC void dropMemoryCost() { + bytes_loaded_ = 0; + bytes_stored_ = 0; + } + + // TODO(rmlarsen): Define min in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin( + const TensorOpCost& rhs) { + bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored()); + compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles()); + return *this; + } + + // TODO(rmlarsen): Define max in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax( + const TensorOpCost& rhs) { + bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored()); + compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( + const TensorOpCost& rhs) { + bytes_loaded_ += rhs.bytes_loaded(); + bytes_stored_ += rhs.bytes_stored(); + compute_cycles_ += rhs.compute_cycles(); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { + bytes_loaded_ *= rhs; + bytes_stored_ *= rhs; + compute_cycles_ *= rhs; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( + TensorOpCost lhs, const TensorOpCost& rhs) { + lhs += rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + TensorOpCost lhs, double rhs) { + lhs *= rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + double lhs, TensorOpCost rhs) { + rhs *= lhs; + return rhs; + } + + friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { + return os << "[bytes_loaded = " << tc.bytes_loaded() + << ", bytes_stored = " << tc.bytes_stored() + << ", compute_cycles = " << tc.compute_cycles() << "]"; + } + + private: + double bytes_loaded_; + double bytes_stored_; + double compute_cycles_; +}; + +// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads +// in [1:max_threads] instead of just switching multi-threading off for small +// work units. +template <typename Device> +class TensorCostModel { + public: + // Scaling from Eigen compute cost to device cycles. + static const int kDeviceCyclesPerComputeCycle = 1; + + // Costs in device cycles. + static const int kStartupCycles = 100000; + static const int kPerThreadCycles = 100000; + static const int kTaskSize = 40000; + + // Returns the number of threads in [1:max_threads] to use for + // evaluating an expression with the given output size and cost per + // coefficient. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( + double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { + double cost = totalCost(output_size, cost_per_coeff); + int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + return numext::mini(max_threads, numext::maxi(1, threads)); + } + + // taskSize assesses parallel task size. + // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task + // granularity needs to be increased to mitigate parallelization overheads. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( + double output_size, const TensorOpCost& cost_per_coeff) { + return totalCost(output_size, cost_per_coeff) / kTaskSize; + } + + private: + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( + double output_size, const TensorOpCost& cost_per_coeff) { + // Cost of memory fetches from L2 cache. 64 is typical cache line size. + // 11 is L2 cache latency on Haswell. + // We don't know whether data is in L1, L2 or L3. But we are most interested + // in single-threaded computational time around 100us-10ms (smaller time + // is too small for parallelization, larger time is not intersting + // either because we are probably using all available threads already). + // And for the target time range, L2 seems to be what matters. Data set + // fitting into L1 is too small to take noticeable time. Data set fitting + // only into L3 presumably will take more than 10ms to load and process. + const double kLoadCycles = 1.0 / 64 * 11; + const double kStoreCycles = 1.0 / 64 * 11; + // Scaling from Eigen compute cost to device cycles. + return output_size * + cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, + kDeviceCyclesPerComputeCycle); + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index b58e513b4..e020d076f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -83,8 +83,10 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi typedef typename internal::traits<ArgType>::Index Index; static const int NumDims = internal::traits<ArgType>::NumDimensions; typedef DSizes<Index, NumDims> Dimensions; - typedef - typename internal::remove_const<typename ArgType::Scalar>::type Scalar; + typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -101,9 +103,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi m_dimensions = op.func().dimensions(op.expression()); } - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { @@ -134,6 +133,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } protected: @@ -236,6 +240,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, static const int NumDims = internal::traits<XprType>::NumDimensions; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -252,9 +259,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); } - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { @@ -284,6 +288,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, return internal::ploadt<PacketReturnType, LoadMode>(m_result + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 821835cf3..1d2d162dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -291,15 +291,9 @@ struct GpuDevice { int max_blocks_; }; -#ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); -#else -#define LAUNCH_CUDA_KERNEL(kernel, ...) \ - { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \ - eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__); -#endif // FIXME: Should be device and kernel specific. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 267f6f8e3..9d141395b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -44,6 +44,26 @@ struct DefaultDevice { #endif } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return l1CacheSize(); +#else + // Running on a CUDA device, return the amount of shared memory available. + return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + return l3CacheSize(); +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { #ifndef __CUDA_ARCH__ // Running single threaded on the host CPU diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index cd3dd214b..c02891465 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -12,145 +12,15 @@ namespace Eigen { -// This defines an interface that ThreadPoolDevice can take to use -// custom thread pools underneath. -class ThreadPoolInterface { - public: - virtual void Schedule(std::function<void()> fn) = 0; - - virtual ~ThreadPoolInterface() {} -}; - -// The implementation of the ThreadPool type ensures that the Schedule method -// runs the functions it is provided in FIFO order when the scheduling is done -// by a single thread. -// Environment provides a way to create threads and also allows to intercept -// task submission and execution. -template <typename Environment> -class ThreadPoolTempl : public ThreadPoolInterface { - public: - // Construct a pool that contains "num_threads" threads. - explicit ThreadPoolTempl(int num_threads, Environment env = Environment()) - : env_(env), threads_(num_threads), waiters_(num_threads) { - for (int i = 0; i < num_threads; i++) { - threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); - } - } - - // Wait until all scheduled work has finished and then destroy the - // set of threads. - ~ThreadPoolTempl() { - { - // Wait for all work to get done. - std::unique_lock<std::mutex> l(mu_); - while (!pending_.empty()) { - empty_.wait(l); - } - exiting_ = true; - - // Wakeup all waiters. - for (auto w : waiters_) { - w->ready = true; - w->task.f = nullptr; - w->cv.notify_one(); - } - } - - // Wait for threads to finish. - for (auto t : threads_) { - delete t; - } - } - - // Schedule fn() for execution in the pool of threads. The functions are - // executed in the order in which they are scheduled. - void Schedule(std::function<void()> fn) { - Task t = env_.CreateTask(std::move(fn)); - std::unique_lock<std::mutex> l(mu_); - if (waiters_.empty()) { - pending_.push_back(std::move(t)); - } else { - Waiter* w = waiters_.back(); - waiters_.pop_back(); - w->ready = true; - w->task = std::move(t); - w->cv.notify_one(); - } - } - - protected: - void WorkerLoop() { - std::unique_lock<std::mutex> l(mu_); - Waiter w; - Task t; - while (!exiting_) { - if (pending_.empty()) { - // Wait for work to be assigned to me - w.ready = false; - waiters_.push_back(&w); - while (!w.ready) { - w.cv.wait(l); - } - t = w.task; - w.task.f = nullptr; - } else { - // Pick up pending work - t = std::move(pending_.front()); - pending_.pop_front(); - if (pending_.empty()) { - empty_.notify_all(); - } - } - if (t.f) { - mu_.unlock(); - env_.ExecuteTask(t); - t.f = nullptr; - mu_.lock(); - } - } - } - - private: - typedef typename Environment::Task Task; - typedef typename Environment::EnvThread Thread; - - struct Waiter { - std::condition_variable cv; - Task task; - bool ready; - }; - - Environment env_; - std::mutex mu_; - MaxSizeVector<Thread*> threads_; // All threads - MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads. - std::deque<Task> pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() - bool exiting_ = false; -}; - -struct StlThreadEnvironment { - struct Task { - std::function<void()> f; - }; - - // EnvThread constructor must start the thread, - // destructor must join the thread. - class EnvThread { - public: - EnvThread(std::function<void()> f) : thr_(f) {} - ~EnvThread() { thr_.join(); } - - private: - std::thread thr_; - }; - - EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); } - Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } - void ExecuteTask(const Task& t) { t.f(); } -}; - -typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool; +// Use the SimpleThreadPool by default. We'll switch to the new non blocking +// thread pool later. +#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL +template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>; +typedef NonBlockingThreadPool ThreadPool; +#else +template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>; +typedef SimpleThreadPool ThreadPool; +#endif // Barrier is an object that allows one or more threads to wait until @@ -264,6 +134,15 @@ struct ThreadPoolDevice { return num_threads_; } + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { // Should return an enum that encodes the ISA supported by the CPU return 1; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 1fb27a65b..5c6748a43 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -88,10 +88,14 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> typedef TensorEvalToOp<ArgType> XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = true, - PacketAccess = true, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = true @@ -104,10 +108,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } - typedef typename XprType::Index Index; - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { @@ -138,6 +138,13 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 947a8ed88..ae4ce3c90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -101,6 +101,11 @@ struct TensorEvaluator } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits<PacketReturnType>::size); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: @@ -184,6 +189,11 @@ struct TensorEvaluator<const Derived, Device> return loadConstant(m_data+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits<PacketReturnType>::size); + } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } protected: @@ -219,6 +229,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -237,6 +248,12 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> return m_functor.template packetOp<Index, PacketReturnType>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits<PacketReturnType>::size); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -270,6 +287,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -293,6 +311,12 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits<UnaryOp>::Cost; + return m_argImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -330,6 +354,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -358,6 +383,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits<BinaryOp>::Cost; + return m_leftImpl.costPerCoeff(vectorized) + + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: @@ -398,6 +431,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> typedef typename XprType::Index Index; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -425,7 +459,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; internal::Selector<PacketSize> select; for (Index i = 0; i < PacketSize; ++i) { select.select[i] = m_condImpl.coeff(index+i); @@ -435,6 +468,13 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> m_elseImpl.template packet<LoadMode>(index)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized) + .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 4f4e07aaf..5c3d4d630 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -59,9 +59,16 @@ class TensorExecutor<Expression, DefaultDevice, true> { const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; + // Manually unroll this loop since compilers don't do it. + const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } const Index VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } for (Index i = VectorizedSize; i < size; ++i) { @@ -78,8 +85,9 @@ class TensorExecutor<Expression, DefaultDevice, true> #ifdef EIGEN_USE_THREADS template <typename Evaluator, typename Index, bool Vectorizable> struct EvalRange { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); } @@ -88,28 +96,34 @@ struct EvalRange { template <typename Evaluator, typename Index> struct EvalRange<Evaluator, Index, true> { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); - + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); Index i = first; - static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { + Index last_chunk_offset = last - 4 * PacketSize; + // Manually unroll this loop since compilers don't do it. + for (; i <= last_chunk_offset; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } + last_chunk_offset = last - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < last; ++i) { evaluator.evalScalar(i); } } }; -template<typename Expression, bool Vectorizable> -class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> -{ +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) @@ -119,24 +133,34 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { + const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; const Index size = array_prod(evaluator.dimensions()); - - static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; - - int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1; - const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); - const unsigned int numblocks = static_cast<unsigned int>(size / blocksize); - - Barrier barrier(numblocks); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize); + size_t num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + if (num_threads > 1) { + num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); } - - if (static_cast<Index>(numblocks) * blocksize < size) { - EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size); +#endif + if (num_threads == 1) { + EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size); + } else { + Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, + &evaluator, i * blocksize, (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange<Evaluator, Index, Vectorizable>::run( + &evaluator, numblocks * blocksize, size); + } + barrier.Wait(); } - - barrier.Wait(); } evaluator.cleanup(); } @@ -147,98 +171,78 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) -template <typename Expression> -class TensorExecutor<Expression, GpuDevice, false> { +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, GpuDevice, Vectorizable> { public: typedef typename Expression::Index Index; - static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); + static void run(const Expression& expr, const GpuDevice& device); }; -template <typename Expression> -class TensorExecutor<Expression, GpuDevice, true> { - public: - typedef typename Expression::Index Index; - static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); -}; #if defined(__CUDACC__) +template <typename Evaluator, typename Index, bool Vectorizable> +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + for (Index i = first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template <typename Evaluator, typename Index> +struct EigenMetaKernelEval<Evaluator, Index, true> { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const Index vectorized_size = (last / PacketSize) * PacketSize; + const Index vectorized_step_size = step_size * PacketSize; + + // Use the vector path + for (Index i = first * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; template <typename Evaluator, typename Index> __global__ void __launch_bounds__(1024) -EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) { - // Cuda memcopies the kernel arguments. That's fine for POD, but for more - // complex types such as evaluators we should really conform to the C++ - // standard and call a proper copy constructor. - Evaluator eval(memcopied_eval); +EigenMetaKernel(Evaluator memcopied_eval, Index size) { const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const Index step_size = blockDim.x * gridDim.x; - // Use the scalar path - for (Index i = first_index; i < size; i += step_size) { - eval.evalScalar(i); - } -} - -template <typename Evaluator, typename Index> -__global__ void -__launch_bounds__(1024) -EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { // Cuda memcopies the kernel arguments. That's fine for POD, but for more // complex types such as evaluators we should really conform to the C++ // standard and call a proper copy constructor. Evaluator eval(memcopied_eval); - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; - - // Use the vector path - const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; - const Index vectorized_step_size = step_size * PacketSize; - const Index vectorized_size = (size / PacketSize) * PacketSize; - for (Index i = first_index * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first_index; i < size; i += step_size) { - eval.evalScalar(i); - } + const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; + EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); } /*static*/ -template <typename Expression> -EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device) -{ +template <typename Expression, bool Vectorizable> +inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( + const Expression& expr, const GpuDevice& device) { TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + if (needs_assign) { const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; const Index size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash if we're called with tensors of size 0. - const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); - LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); -} - + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); -/*static*/ -template<typename Expression> -EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device) -{ - TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size); - const Index size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash if we're called with tensors of size 0. - const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); - LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), + num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6db45ade..ece2ed91b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -129,6 +129,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar; typedef OutputScalar CoeffReturnType; typedef typename PacketType<OutputScalar, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -176,7 +177,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); @@ -189,11 +189,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D return m_data[index]; } - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + template <int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType + packet(Index index) const { return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 14f480901..1ce53ad69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -83,10 +83,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> typedef TensorForcedEvalOp<ArgType> XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = true, - PacketAccess = (internal::packet_traits<Scalar>::size > 1), + PacketAccess = (PacketSize > 1), Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = true }; @@ -95,10 +99,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { @@ -132,6 +132,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index b7c13f67f..33cd00391 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -64,7 +64,7 @@ struct scalar_sigmoid_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { const T one = T(1); - return one / (one + std::exp(-x)); + return one / (one + numext::exp(-x)); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -158,8 +158,8 @@ template <typename T> struct MeanReducer } protected: - int scalarCount_; - int packetCount_; + DenseIndex scalarCount_; + DenseIndex packetCount_; }; template <typename T> struct MaxReducer @@ -594,6 +594,8 @@ template <> class UniformRandomGenerator<std::complex<double> > { template <typename Scalar> struct functor_traits<UniformRandomGenerator<Scalar> > { enum { + // Rough estimate. + Cost = 100 * NumTraits<Scalar>::MulCost, PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess }; }; @@ -774,6 +776,8 @@ template <typename T> class NormalRandomGenerator { template <typename Scalar> struct functor_traits<NormalRandomGenerator<Scalar> > { enum { + // Rough estimate. + Cost = 100 * NumTraits<Scalar>::MulCost, PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess }; }; @@ -799,7 +803,7 @@ class GaussianGenerator { T offset = coordinates[i] - m_means[i]; tmp += offset * offset / m_two_sigmas[i]; } - return std::exp(-tmp); + return numext::exp(-tmp); } private: @@ -807,6 +811,15 @@ class GaussianGenerator { array<T, NumDims> m_two_sigmas; }; +template <typename T, typename Index, size_t NumDims> +struct functor_traits<GaussianGenerator<T, Index, NumDims> > { + enum { + Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost + + functor_traits<scalar_quotient_op<T, T> >::Cost) + + functor_traits<scalar_exp_op<T> >::Cost, + PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess + }; +}; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index e4154bd0b..8ff7d5815 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -145,6 +145,14 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. + return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() + + TensorOpCost::MulCost<Scalar>()); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 72594a05c..bafcc67bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -159,6 +159,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Self; typedef TensorEvaluator<ArgType, Device> Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -307,9 +310,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -362,15 +362,14 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { return packetWithPossibleZero(index); } - const Index indices[2] = {index, index + packetSize - 1}; + const Index indices[2] = {index, index + PacketSize - 1}; const Index patchIndex = indices[0] / m_fastPatchStride; if (patchIndex != indices[1] / m_fastPatchStride) { return packetWithPossibleZero(index); @@ -434,12 +433,23 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Index rowInflateStride() const { return m_row_inflate_strides; } Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator<ArgType, Device>::CoordAccess is false. + const double compute_cost = 3 * TensorOpCost::DivCost<Index>() + + 6 * TensorOpCost::MulCost<Index>() + + 8 * TensorOpCost::MulCost<Index>(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 368e6f685..de2f67d74 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -81,6 +81,10 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false, @@ -123,11 +127,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -190,18 +189,30 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() + + 3 * TensorOpCost::MulCost<Index>() + + 2 * TensorOpCost::AddCost<Index>()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) + return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, + compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 9b85914ff..63a8476ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -155,6 +155,10 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> return m_impl.template packet<LoadMode>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 6af2d45d4..cd04716bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -24,9 +24,17 @@ const T2& choose(Cond<false>, const T1&, const T2& second) { return second; } -template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + +template <typename T, typename X, typename Y> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const X x, const Y y) { + return static_cast<T>((x + y - 1) / y); +} + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T divup(const T x, const T y) { - return (x + y - 1) / y; + return static_cast<T>((x + y - 1) / y); } template <size_t n> struct max_n_1 { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index a9c222ea0..bfa65a607 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -142,6 +142,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> return m_impl.template packet<LoadMode>(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } @@ -449,6 +453,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); if (result) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a595a0175..88b838b27 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -87,6 +87,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device typedef typename XprType::Index Index; static const int NumDims = internal::array_size<PaddingDimensions>::value; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -129,10 +133,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -224,21 +224,51 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device return m_impl.coeff(inputIndex); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + TensorOpCost cost = m_impl.costPerCoeff(vectorized); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < NumDims; ++i) + updateCostPerDimension(cost, i, i == 0); + } else { + for (int i = NumDims - 1; i >= 0; --i) + updateCostPerDimension(cost, i, i == NumDims - 1); + } + return cost; + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + private: + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { + const double in = static_cast<double>(m_impl.dimensions()[i]); + const double out = in + m_padding[i].first + m_padding[i].second; + if (out == 0) + return; + const double reduction = in / out; + cost *= reduction; + if (first) { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + + reduction * (1 * TensorOpCost::AddCost<Index>())); + } else { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + + 2 * TensorOpCost::MulCost<Index>() + + reduction * (2 * TensorOpCost::MulCost<Index>() + + 1 * TensorOpCost::DivCost<Index>())); + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index first = index; - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; @@ -263,7 +293,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } } - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index first = index; const Index lastPaddedLeft = m_padding[0].first; const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); @@ -288,16 +318,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index first = index; - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; @@ -322,7 +351,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } } - const Index last = index + packetSize - 1; + const Index last = index + PacketSize - 1; const Index first = index; const Index lastPaddedLeft = m_padding[NumDims-1].first; const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); @@ -347,9 +376,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 0bf460f4e..a87e45330 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -85,6 +85,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + enum { IsAligned = false, @@ -137,9 +141,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -183,12 +184,11 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0; - Index indices[2] = {index, index + packetSize - 1}; + Index indices[2] = {index, index + PacketSize - 1}; Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], indices[1] / m_outputStrides[output_stride_index]}; Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], @@ -229,15 +229,15 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> inputIndices[0] += (patchIndices[0] + patchOffsets[0]); inputIndices[1] += (patchIndices[1] + patchOffsets[1]); - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); return rslt; } else { - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); @@ -245,6 +245,14 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() + + TensorOpCost::MulCost<Index>() + + 2 * TensorOpCost::AddCost<Index>()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 00f870328..885295f0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -214,7 +214,7 @@ struct FullReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); - *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer); + *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); } }; @@ -222,18 +222,19 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template <typename Self, typename Op, - bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> + bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, typename Self::CoeffReturnType* output) { - *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce( + *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce( self, firstIndex, numValuesToReduce, reducer); } }; -template <typename Self, typename Op> -struct FullReducer<Self, Op, ThreadPoolDevice, false> { +// Multithreaded full reducer +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { static const bool HasOptimizedImplementation = !Op::IsStateful; static const int PacketSize = unpacket_traits<typename Self::PacketReturnType>::size; @@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> { *output = reducer.finalize(reducer.initialize()); return; } - const std::size_t num_threads = device.numThreads(); - if (num_threads == 1) { - *output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer); - return; - } else { - const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize); - - Barrier barrier(numblocks); - MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self, - i * blocksize, blocksize, reducer, &shards[i]); - } - - typename Self::CoeffReturnType finalShard; - if (static_cast<Index>(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer<Self, Op, false>::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); - } else { - finalShard = reducer.initialize(); - } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i], &finalShard); - } - *output = reducer.finalize(finalShard); - } - } -}; - -template <typename Self, typename Op> -struct FullReducer<Self, Op, ThreadPoolDevice, true> { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = - unpacket_traits<typename Self::PacketReturnType>::size; - - // launch one reducer per thread and accumulate the result. - static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, - typename Self::CoeffReturnType* output) { - typedef typename Self::Index Index; - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - if (num_coeffs == 0) { - *output = reducer.finalize(reducer.initialize()); - return; - } - const std::size_t num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + const TensorOpCost cost = + self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, + PacketSize); + const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( + num_coeffs, cost, device.numThreads()); +#else + const int num_threads = device.numThreads(); +#endif if (num_threads == 1) { - *output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer); + *output = + InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); return; } - const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize); + const Index blocksize = + std::floor<Index>(static_cast<float>(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); Barrier barrier(numblocks); MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run, + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, self, i * blocksize, blocksize, reducer, &shards[i]); } typename Self::CoeffReturnType finalShard; - if (static_cast<Index>(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer<Self, Op, true>::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, + reducer); } else { finalShard = reducer.initialize(); } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { + + for (Index i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } *output = reducer.finalize(finalShard); @@ -411,6 +377,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self; static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess; + typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -495,8 +464,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static bool size_large_enough(Index total_size) { +#ifndef EIGEN_USE_COST_MODEL + return total_size > 1024 * 1024; +#else + return true || total_size; +#endif + } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); @@ -504,7 +478,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> // Use the FullReducer if possible. if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) { + (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) { bool need_assign = false; if (!data) { @@ -584,16 +558,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; const Index firstIndex = firstInput(index); - for (Index i = 0; i < packetSize; ++i) { + for (Index i = 0; i < PacketSize; ++i) { Op reducer(m_reducer); values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce, num_values_to_reduce, reducer); @@ -602,18 +575,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> const Index firstIndex = firstInput(index); const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1; // TBD: extend this the the n innermost dimensions that we preserve. - if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { Op reducer(m_reducer); typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>(); internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum); return reducer.finalizePacket(accum); } else { - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); } } } else { - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index + i); } } @@ -621,6 +594,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> return rslt; } + // Must be called after evalSubExprsIfNeeded(). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index c33d54d6e..fd2587dd5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -130,13 +130,18 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { assert(false && "Should only be called on floats"); } - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return; + } + const int block_size = 256; const int num_per_thread = 128; - const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); if (num_blocks > 1) { // We initialize the outputs outside the reduction kernel when we can't be sure that there @@ -231,7 +236,7 @@ struct InnerReducer<Self, Op, GpuDevice> { return true; } - static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; // It's faster to use the usual code. @@ -310,7 +315,7 @@ struct OuterReducer<Self, Op, GpuDevice> { return true; } - static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; // It's faster to use the usual code. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 96d92038c..1a59cc8f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -104,6 +104,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device typedef typename XprType::Index Index; static const int NumDims = internal::array_size<ReverseDimensions>::value; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -135,10 +139,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -195,21 +195,33 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // TODO(ndjaitly): write a better packing routine that uses // local structure. EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type - values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + for (int i = 0; i < NumDims; ++i) { + if (m_reverse[i]) { + compute_cost += 2 * TensorOpCost::AddCost<Index>(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -246,6 +258,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } @@ -256,14 +269,13 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // This code is pilfered from TensorMorphing.h - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index c19833ea5..e76533710 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -104,6 +104,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -145,9 +148,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -166,18 +166,25 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + + 2 * TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -219,6 +226,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -230,9 +240,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> : Base(op, device) { } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(this->srcCoeff(index)); @@ -241,12 +248,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> template <int StoreMode> EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); - for (int i = 0; i < packetSize; ++i) { + for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 085f8fd3d..52b7d216a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -103,6 +103,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, @@ -142,10 +146,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -164,12 +164,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; + Index indices[] = {index, index + PacketSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_outputStrides[i]; @@ -193,15 +192,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]); return rslt; } else { - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); @@ -209,6 +208,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() + + TensorOpCost::MulCost<Index>() + + TensorOpCost::DivCost<Index>()) + + TensorOpCost::MulCost<Index>(); + if (vectorized) { + compute_cost *= 2; // packet() computes two indices + } + const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1); + return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) + + // Computation is not vectorized per se, but it is done once per packet. + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: @@ -266,6 +279,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { @@ -275,12 +289,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; + Index indices[] = {index, index + PacketSize - 1}; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; @@ -304,15 +317,15 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { this->m_impl.template writePacket<Unaligned>(inputIndices[0], x); } else { - EIGEN_ALIGN_MAX Scalar values[packetSize]; + EIGEN_ALIGN_MAX Scalar values[PacketSize]; internal::pstore<Scalar, PacketReturnType>(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; - this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; - for (int i = 1; i < packetSize-1; ++i) { + this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1]; + for (int i = 1; i < PacketSize-1; ++i) { this->coeffRef(index+i) = values[i]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 3e56589c3..5950f38e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -53,9 +53,7 @@ struct TensorUInt128 template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { - typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type UnsignedT; - typedef typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type UnsignedLow; - eigen_assert(static_cast<UnsignedT>(x) <= static_cast<UnsignedLow>(NumTraits<LOW>::highest())); + eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest()))); eigen_assert(x >= 0); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 5bdfbad46..e735fc76f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -171,6 +171,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D static const int NumDims = NumInputDims + 1; typedef DSizes<Index, NumDims> Dimensions; typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { IsAligned = false, @@ -336,9 +339,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D } } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -408,16 +408,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const Index packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { return packetWithPossibleZero(index); } - const Index indices[2] = {index, index + packetSize - 1}; + const Index indices[2] = {index, index + PacketSize - 1}; const Index patchIndex = indices[0] / m_fastPatchStride; if (patchIndex != indices[1] / m_fastPatchStride) { return packetWithPossibleZero(index); @@ -495,6 +494,14 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D return packetWithPossibleZero(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = + 10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() + + 8 * TensorOpCost::AddCost<Index>(); + return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } @@ -518,9 +525,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } PacketReturnType rslt = internal::pload<PacketReturnType>(values); diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt new file mode 100644 index 000000000..88fef50c6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_ThreadPool_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel + ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h new file mode 100644 index 000000000..6dd64f185 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -0,0 +1,234 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ +#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ + +namespace Eigen { + +// EventCount allows to wait for arbitrary predicates in non-blocking +// algorithms. Think of condition variable, but wait predicate does not need to +// be protected by a mutex. Usage: +// Waiting thread does: +// +// if (predicate) +// return act(); +// EventCount::Waiter& w = waiters[my_index]; +// ec.Prewait(&w); +// if (predicate) { +// ec.CancelWait(&w); +// return act(); +// } +// ec.CommitWait(&w); +// +// Notifying thread does: +// +// predicate = true; +// ec.Notify(true); +// +// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not +// cheap, but they are executed only if the preceeding predicate check has +// failed. +// +// Algorihtm outline: +// There are two main variables: predicate (managed by user) and state_. +// Operation closely resembles Dekker mutual algorithm: +// https://en.wikipedia.org/wiki/Dekker%27s_algorithm +// Waiting thread sets state_ then checks predicate, Notifying thread sets +// predicate then checks state_. Due to seq_cst fences in between these +// operations it is guaranteed than either waiter will see predicate change +// and won't block, or notifying thread will see state_ change and will unblock +// the waiter, or both. But it can't happen that both threads don't see each +// other changes, which would lead to deadlock. +class EventCount { + public: + class Waiter; + + EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) { + eigen_assert(waiters.size() < (1 << kWaiterBits) - 1); + // Initialize epoch to something close to overflow to test overflow. + state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2); + } + + ~EventCount() { + // Ensure there are no waiters. + eigen_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask); + } + + // Prewait prepares for waiting. + // After calling this function the thread must re-check the wait predicate + // and call either CancelWait or CommitWait passing the same Waiter object. + void Prewait(Waiter* w) { + w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + } + + // CommitWait commits waiting. + void CommitWait(Waiter* w) { + w->state = Waiter::kNotSignaled; + // Modification epoch of this waiter. + uint64_t epoch = + (w->epoch & kEpochMask) + + (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + uint64_t state = state_.load(std::memory_order_seq_cst); + for (;;) { + if (int64_t((state & kEpochMask) - epoch) < 0) { + // The preceeding waiter has not decided on its fate. Wait until it + // calls either CancelWait or CommitWait, or is notified. + EIGEN_THREAD_YIELD(); + state = state_.load(std::memory_order_seq_cst); + continue; + } + // We've already been notified. + if (int64_t((state & kEpochMask) - epoch) > 0) return; + // Remove this thread from prewait counter and add it to the waiter list. + eigen_assert((state & kWaiterMask) != 0); + uint64_t newstate = state - kWaiterInc + kEpochInc; + newstate = (newstate & ~kStackMask) | (w - &waiters_[0]); + if ((state & kStackMask) == kStackMask) + w->next.store(nullptr, std::memory_order_relaxed); + else + w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed); + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_release)) + break; + } + Park(w); + } + + // CancelWait cancels effects of the previous Prewait call. + void CancelWait(Waiter* w) { + uint64_t epoch = + (w->epoch & kEpochMask) + + (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift); + uint64_t state = state_.load(std::memory_order_relaxed); + for (;;) { + if (int64_t((state & kEpochMask) - epoch) < 0) { + // The preceeding waiter has not decided on its fate. Wait until it + // calls either CancelWait or CommitWait, or is notified. + EIGEN_THREAD_YIELD(); + state = state_.load(std::memory_order_relaxed); + continue; + } + // We've already been notified. + if (int64_t((state & kEpochMask) - epoch) > 0) return; + // Remove this thread from prewait counter. + eigen_assert((state & kWaiterMask) != 0); + if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc, + std::memory_order_relaxed)) + return; + } + } + + // Notify wakes one or all waiting threads. + // Must be called after changing the associated wait predicate. + void Notify(bool all) { + std::atomic_thread_fence(std::memory_order_seq_cst); + uint64_t state = state_.load(std::memory_order_acquire); + for (;;) { + // Easy case: no waiters. + if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0) + return; + uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + uint64_t newstate; + if (all) { + // Reset prewait counter and empty wait list. + newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask; + } else if (waiters) { + // There is a thread in pre-wait state, unblock it. + newstate = state + kEpochInc - kWaiterInc; + } else { + // Pop a waiter from list and unpark it. + Waiter* w = &waiters_[state & kStackMask]; + Waiter* wnext = w->next.load(std::memory_order_relaxed); + uint64_t next = kStackMask; + if (wnext != nullptr) next = wnext - &waiters_[0]; + // Note: we don't add kEpochInc here. ABA problem on the lock-free stack + // can't happen because a waiter is re-pushed onto the stack only after + // it was in the pre-wait state which inevitably leads to epoch + // increment. + newstate = (state & kEpochMask) + next; + } + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_acquire)) { + if (!all && waiters) return; // unblocked pre-wait thread + if ((state & kStackMask) == kStackMask) return; + Waiter* w = &waiters_[state & kStackMask]; + if (!all) w->next.store(nullptr, std::memory_order_relaxed); + Unpark(w); + return; + } + } + } + + class Waiter { + friend class EventCount; + std::atomic<Waiter*> next; + std::mutex mu; + std::condition_variable cv; + uint64_t epoch; + unsigned state; + enum { + kNotSignaled, + kWaiting, + kSignaled, + }; + // Prevent false sharing with other Waiter objects in the same vector. + char pad_[128]; + }; + + private: + // State_ layout: + // - low kStackBits is a stack of waiters committed wait. + // - next kWaiterBits is count of waiters in prewait state. + // - next kEpochBits is modification counter. + static const uint64_t kStackBits = 16; + static const uint64_t kStackMask = (1ull << kStackBits) - 1; + static const uint64_t kWaiterBits = 16; + static const uint64_t kWaiterShift = 16; + static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) + << kWaiterShift; + static const uint64_t kWaiterInc = 1ull << kWaiterBits; + static const uint64_t kEpochBits = 32; + static const uint64_t kEpochShift = 32; + static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; + static const uint64_t kEpochInc = 1ull << kEpochShift; + std::atomic<uint64_t> state_; + std::vector<Waiter>& waiters_; + + void Park(Waiter* w) { + std::unique_lock<std::mutex> lock(w->mu); + while (w->state != Waiter::kSignaled) { + w->state = Waiter::kWaiting; + w->cv.wait(lock); + } + } + + void Unpark(Waiter* waiters) { + Waiter* next = nullptr; + for (Waiter* w = waiters; w; w = next) { + next = w->next.load(std::memory_order_relaxed); + unsigned state; + { + std::unique_lock<std::mutex> lock(w->mu); + state = w->state; + w->state = Waiter::kSignaled; + } + // Avoid notifying if it wasn't waiting. + if (state == Waiter::kWaiting) w->cv.notify_one(); + } + } + + EventCount(const EventCount&) = delete; + void operator=(const EventCount&) = delete; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h new file mode 100644 index 000000000..1c471a19f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H +#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H + + +namespace Eigen { + +template <typename Environment> +class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { + public: + typedef typename Environment::Task Task; + typedef RunQueue<Task, 1024> Queue; + + NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) + : env_(env), + threads_(num_threads), + queues_(num_threads), + waiters_(num_threads), + blocked_(), + spinning_(), + done_(), + ec_(waiters_) { + for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue()); + for (int i = 0; i < num_threads; i++) + threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); + } + + ~NonBlockingThreadPoolTempl() { + done_.store(true, std::memory_order_relaxed); + // Now if all threads block without work, they will start exiting. + // But note that threads can continue to work arbitrary long, + // block, submit new work, unblock and otherwise live full life. + ec_.Notify(true); + + // Join threads explicitly to avoid destruction order issues. + for (size_t i = 0; i < threads_.size(); i++) delete threads_[i]; + for (size_t i = 0; i < threads_.size(); i++) delete queues_[i]; + } + + void Schedule(std::function<void()> fn) { + Task t = env_.CreateTask(std::move(fn)); + PerThread* pt = GetPerThread(); + if (pt->pool == this) { + // Worker thread of this pool, push onto the thread's queue. + Queue* q = queues_[pt->index]; + t = q->PushFront(std::move(t)); + } else { + // A free-standing thread (or worker of another pool), push onto a random + // queue. + Queue* q = queues_[Rand(&pt->rand) % queues_.size()]; + t = q->PushBack(std::move(t)); + } + // Note: below we touch this after making w available to worker threads. + // Strictly speaking, this can lead to a racy-use-after-free. Consider that + // Schedule is called from a thread that is neither main thread nor a worker + // thread of this pool. Then, execution of w directly or indirectly + // completes overall computations, which in turn leads to destruction of + // this. We expect that such scenario is prevented by program, that is, + // this is kept alive while any threads can potentially be in Schedule. + if (!t.f) + ec_.Notify(false); + else + env_.ExecuteTask(t); // Push failed, execute directly. + } + + private: + typedef typename Environment::EnvThread Thread; + + struct PerThread { + bool inited; + NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads. + unsigned index; // Worker thread index in pool. + unsigned rand; // Random generator state. + }; + + Environment env_; + MaxSizeVector<Thread*> threads_; + MaxSizeVector<Queue*> queues_; + std::vector<EventCount::Waiter> waiters_; + std::atomic<unsigned> blocked_; + std::atomic<bool> spinning_; + std::atomic<bool> done_; + EventCount ec_; + + // Main worker thread loop. + void WorkerLoop(unsigned index) { + PerThread* pt = GetPerThread(); + pt->pool = this; + pt->index = index; + Queue* q = queues_[index]; + EventCount::Waiter* waiter = &waiters_[index]; + std::vector<Task> stolen; + for (;;) { + Task t; + if (!stolen.empty()) { + t = std::move(stolen.back()); + stolen.pop_back(); + } + if (!t.f) t = q->PopFront(); + if (!t.f) { + if (Steal(&stolen)) { + t = std::move(stolen.back()); + stolen.pop_back(); + while (stolen.size()) { + Task t1 = q->PushFront(std::move(stolen.back())); + stolen.pop_back(); + if (t1.f) { + // There is not much we can do in this case. Just execute the + // remaining directly. + stolen.push_back(std::move(t1)); + break; + } + } + } + } + if (t.f) { + env_.ExecuteTask(t); + continue; + } + // Leave one thread spinning. This reduces latency. + if (!spinning_ && !spinning_.exchange(true)) { + bool nowork = true; + for (int i = 0; i < 1000; i++) { + if (!OutOfWork()) { + nowork = false; + break; + } + } + spinning_ = false; + if (!nowork) continue; + } + if (!WaitForWork(waiter)) return; + } + } + + // Steal tries to steal work from other worker threads in best-effort manner. + bool Steal(std::vector<Task>* stolen) { + if (queues_.size() == 1) return false; + PerThread* pt = GetPerThread(); + unsigned lastq = pt->index; + for (unsigned i = queues_.size(); i > 0; i--) { + unsigned victim = Rand(&pt->rand) % queues_.size(); + if (victim == lastq && queues_.size() > 2) { + i++; + continue; + } + // Steal half of elements from a victim queue. + // It is typical to steal just one element, but that assumes that work is + // recursively subdivided in halves so that the stolen element is exactly + // half of work. If work elements are equally-sized, then is makes sense + // to steal half of elements at once and then work locally for a while. + if (queues_[victim]->PopBackHalf(stolen)) return true; + lastq = victim; + } + // Just to make sure that we did not miss anything. + for (unsigned i = queues_.size(); i > 0; i--) + if (queues_[i - 1]->PopBackHalf(stolen)) return true; + return false; + } + + // WaitForWork blocks until new work is available, or if it is time to exit. + bool WaitForWork(EventCount::Waiter* waiter) { + // We already did best-effort emptiness check in Steal, so prepare blocking. + ec_.Prewait(waiter); + // Now do reliable emptiness check. + if (!OutOfWork()) { + ec_.CancelWait(waiter); + return true; + } + // Number of blocked threads is used as termination condition. + // If we are shutting down and all worker threads blocked without work, + // that's we are done. + blocked_++; + if (done_ && blocked_ == threads_.size()) { + ec_.CancelWait(waiter); + // Almost done, but need to re-check queues. + // Consider that all queues are empty and all worker threads are preempted + // right after incrementing blocked_ above. Now a free-standing thread + // submits work and calls destructor (which sets done_). If we don't + // re-check queues, we will exit leaving the work unexecuted. + if (!OutOfWork()) { + // Note: we must not pop from queues before we decrement blocked_, + // otherwise the following scenario is possible. Consider that instead + // of checking for emptiness we popped the only element from queues. + // Now other worker threads can start exiting, which is bad if the + // work item submits other work. So we just check emptiness here, + // which ensures that all worker threads exit at the same time. + blocked_--; + return true; + } + // Reached stable termination state. + ec_.Notify(true); + return false; + } + ec_.CommitWait(waiter); + blocked_--; + return true; + } + + bool OutOfWork() { + for (unsigned i = 0; i < queues_.size(); i++) + if (!queues_[i]->Empty()) return false; + return true; + } + + PerThread* GetPerThread() { + EIGEN_THREAD_LOCAL PerThread per_thread_; + PerThread* pt = &per_thread_; + if (pt->inited) return pt; + pt->inited = true; + pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id()); + return pt; + } + + static unsigned Rand(unsigned* state) { + return *state = *state * 1103515245 + 12345; + } +}; + +typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h new file mode 100644 index 000000000..0544a6e15 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -0,0 +1,210 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ +#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ + + +namespace Eigen { + +// RunQueue is a fixed-size, partially non-blocking deque or Work items. +// Operations on front of the queue must be done by a single thread (owner), +// operations on back of the queue can be done by multiple threads concurrently. +// +// Algorithm outline: +// All remote threads operating on the queue back are serialized by a mutex. +// This ensures that at most two threads access state: owner and one remote +// thread (Size aside). The algorithm ensures that the occupied region of the +// underlying array is logically continuous (can wraparound, but no stray +// occupied elements). Owner operates on one end of this region, remote thread +// operates on the other end. Synchronization between these threads +// (potential consumption of the last element and take up of the last empty +// element) happens by means of state variable in each element. States are: +// empty, busy (in process of insertion of removal) and ready. Threads claim +// elements (empty->busy and ready->busy transitions) by means of a CAS +// operation. The finishing transition (busy->empty and busy->ready) are done +// with plain store as the element is exclusively owned by the current thread. +// +// Note: we could permit only pointers as elements, then we would not need +// separate state variable as null/non-null pointer value would serve as state, +// but that would require malloc/free per operation for large, complex values +// (and this is designed to store std::function<()>). +template <typename Work, unsigned kSize> +class RunQueue { + public: + RunQueue() : front_(), back_() { + // require power-of-two for fast masking + eigen_assert((kSize & (kSize - 1)) == 0); + eigen_assert(kSize > 2); // why would you do this? + eigen_assert(kSize <= (64 << 10)); // leave enough space for counter + for (unsigned i = 0; i < kSize; i++) + array_[i].state.store(kEmpty, std::memory_order_relaxed); + } + + ~RunQueue() { eigen_assert(Size() == 0); } + + // PushFront inserts w at the beginning of the queue. + // If queue is full returns w, otherwise returns default-constructed Work. + Work PushFront(Work w) { + unsigned front = front_.load(std::memory_order_relaxed); + Elem* e = &array_[front & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kEmpty || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return w; + front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed); + e->w = std::move(w); + e->state.store(kReady, std::memory_order_release); + return Work(); + } + + // PopFront removes and returns the first element in the queue. + // If the queue was empty returns default-constructed Work. + Work PopFront() { + unsigned front = front_.load(std::memory_order_relaxed); + Elem* e = &array_[(front - 1) & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kReady || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return Work(); + Work w = std::move(e->w); + e->state.store(kEmpty, std::memory_order_release); + front = ((front - 1) & kMask2) | (front & ~kMask2); + front_.store(front, std::memory_order_relaxed); + return w; + } + + // PushBack adds w at the end of the queue. + // If queue is full returns w, otherwise returns default-constructed Work. + Work PushBack(Work w) { + std::unique_lock<std::mutex> lock(mutex_); + unsigned back = back_.load(std::memory_order_relaxed); + Elem* e = &array_[(back - 1) & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kEmpty || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return w; + back = ((back - 1) & kMask2) | (back & ~kMask2); + back_.store(back, std::memory_order_relaxed); + e->w = std::move(w); + e->state.store(kReady, std::memory_order_release); + return Work(); + } + + // PopBack removes and returns the last elements in the queue. + // Can fail spuriously. + Work PopBack() { + if (Empty()) return 0; + std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock); + if (!lock) return Work(); + unsigned back = back_.load(std::memory_order_relaxed); + Elem* e = &array_[back & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kReady || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return Work(); + Work w = std::move(e->w); + e->state.store(kEmpty, std::memory_order_release); + back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed); + return w; + } + + // PopBackHalf removes and returns half last elements in the queue. + // Returns number of elements removed. But can also fail spuriously. + unsigned PopBackHalf(std::vector<Work>* result) { + if (Empty()) return 0; + std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock); + if (!lock) return 0; + unsigned back = back_.load(std::memory_order_relaxed); + unsigned size = Size(); + unsigned mid = back; + if (size > 1) mid = back + (size - 1) / 2; + unsigned n = 0; + unsigned start = 0; + for (; static_cast<int>(mid - back) >= 0; mid--) { + Elem* e = &array_[mid & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (n == 0) { + if (s != kReady || + !e->state.compare_exchange_strong(s, kBusy, + std::memory_order_acquire)) + continue; + start = mid; + } else { + // Note: no need to store temporal kBusy, we exclusively own these + // elements. + eigen_assert(s == kReady); + } + result->push_back(std::move(e->w)); + e->state.store(kEmpty, std::memory_order_release); + n++; + } + if (n != 0) + back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed); + return n; + } + + // Size returns current queue size. + // Can be called by any thread at any time. + unsigned Size() const { + // Emptiness plays critical role in thread pool blocking. So we go to great + // effort to not produce false positives (claim non-empty queue as empty). + for (;;) { + // Capture a consistent snapshot of front/tail. + unsigned front = front_.load(std::memory_order_acquire); + unsigned back = back_.load(std::memory_order_acquire); + unsigned front1 = front_.load(std::memory_order_relaxed); + if (front != front1) continue; + int size = (front & kMask2) - (back & kMask2); + // Fix overflow. + if (size < 0) size += 2 * kSize; + // Order of modification in push/pop is crafted to make the queue look + // larger than it is during concurrent modifications. E.g. pop can + // decrement size before the corresponding push has incremented it. + // So the computed size can be up to kSize + 1, fix it. + if (size > static_cast<int>(kSize)) size = kSize; + return size; + } + } + + // Empty tests whether container is empty. + // Can be called by any thread at any time. + bool Empty() const { return Size() == 0; } + + private: + static const unsigned kMask = kSize - 1; + static const unsigned kMask2 = (kSize << 1) - 1; + struct Elem { + std::atomic<uint8_t> state; + Work w; + }; + enum { + kEmpty, + kBusy, + kReady, + }; + std::mutex mutex_; + // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of + // front/back, repsectively. The remaining bits contain modification counters + // that are incremented on Push operations. This allows us to (1) distinguish + // between empty and full conditions (if we would use log(kSize) bits for + // position, these conditions would be indistinguishable); (2) obtain + // consistent snapshot of front_/back_ for Size operation using the + // modification counters. + std::atomic<unsigned> front_; + std::atomic<unsigned> back_; + Elem array_[kSize]; + + RunQueue(const RunQueue&) = delete; + void operator=(const RunQueue&) = delete; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h new file mode 100644 index 000000000..17fd1658b --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h @@ -0,0 +1,127 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H +#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H + +namespace Eigen { + +// The implementation of the ThreadPool type ensures that the Schedule method +// runs the functions it is provided in FIFO order when the scheduling is done +// by a single thread. +// Environment provides a way to create threads and also allows to intercept +// task submission and execution. +template <typename Environment> +class SimpleThreadPoolTempl : public ThreadPoolInterface { + public: + // Construct a pool that contains "num_threads" threads. + explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment()) + : env_(env), threads_(num_threads), waiters_(num_threads) { + for (int i = 0; i < num_threads; i++) { + threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); + } + } + + // Wait until all scheduled work has finished and then destroy the + // set of threads. + ~SimpleThreadPoolTempl() { + { + // Wait for all work to get done. + std::unique_lock<std::mutex> l(mu_); + while (!pending_.empty()) { + empty_.wait(l); + } + exiting_ = true; + + // Wakeup all waiters. + for (auto w : waiters_) { + w->ready = true; + w->task.f = nullptr; + w->cv.notify_one(); + } + } + + // Wait for threads to finish. + for (auto t : threads_) { + delete t; + } + } + + // Schedule fn() for execution in the pool of threads. The functions are + // executed in the order in which they are scheduled. + void Schedule(std::function<void()> fn) { + Task t = env_.CreateTask(std::move(fn)); + std::unique_lock<std::mutex> l(mu_); + if (waiters_.empty()) { + pending_.push_back(std::move(t)); + } else { + Waiter* w = waiters_.back(); + waiters_.pop_back(); + w->ready = true; + w->task = std::move(t); + w->cv.notify_one(); + } + } + + protected: + void WorkerLoop() { + std::unique_lock<std::mutex> l(mu_); + Waiter w; + Task t; + while (!exiting_) { + if (pending_.empty()) { + // Wait for work to be assigned to me + w.ready = false; + waiters_.push_back(&w); + while (!w.ready) { + w.cv.wait(l); + } + t = w.task; + w.task.f = nullptr; + } else { + // Pick up pending work + t = std::move(pending_.front()); + pending_.pop_front(); + if (pending_.empty()) { + empty_.notify_all(); + } + } + if (t.f) { + mu_.unlock(); + env_.ExecuteTask(t); + t.f = nullptr; + mu_.lock(); + } + } + } + + private: + typedef typename Environment::Task Task; + typedef typename Environment::EnvThread Thread; + + struct Waiter { + std::condition_variable cv; + Task task; + bool ready; + }; + + Environment env_; + std::mutex mu_; + MaxSizeVector<Thread*> threads_; // All threads + MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads. + std::deque<Task> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() + bool exiting_ = false; +}; + +typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h new file mode 100644 index 000000000..d2204ad5b --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h @@ -0,0 +1,38 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H +#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H + +namespace Eigen { + +struct StlThreadEnvironment { + struct Task { + std::function<void()> f; + }; + + // EnvThread constructor must start the thread, + // destructor must join the thread. + class EnvThread { + public: + EnvThread(std::function<void()> f) : thr_(f) {} + ~EnvThread() { thr_.join(); } + + private: + std::thread thr_; + }; + + EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); } + Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } + void ExecuteTask(const Task& t) { t.f(); } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h new file mode 100644 index 000000000..cfa221732 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h @@ -0,0 +1,22 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H +#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H + +// Try to come up with a portable implementation of thread local variables +#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7) +#define EIGEN_THREAD_LOCAL static __thread +#elif EIGEN_COMP_CLANG +#define EIGEN_THREAD_LOCAL static __thread +#else +#define EIGEN_THREAD_LOCAL static thread_local +#endif + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h new file mode 100644 index 000000000..38b40aceb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h @@ -0,0 +1,26 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H +#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H + +namespace Eigen { + +// This defines an interface that ThreadPoolDevice can take to use +// custom thread pools underneath. +class ThreadPoolInterface { + public: + virtual void Schedule(std::function<void()> fn) = 0; + + virtual ~ThreadPoolInterface() {} +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h new file mode 100644 index 000000000..a859c7ba3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h @@ -0,0 +1,20 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H +#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H + +// Try to come up with a portable way to yield +#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7) +#define EIGEN_THREAD_YIELD() sched_yield() +#else +#define EIGEN_THREAD_YIELD() std::this_thread::yield() +#endif + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt new file mode 100644 index 000000000..7eab492d6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_util_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_util_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel + ) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h index c582e21f5..f479590b9 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -10,12 +10,22 @@ #ifndef EIGEN_CXX11META_H #define EIGEN_CXX11META_H +#include <vector> +#include "EmulateArray.h" + +// Emulate the cxx11 functionality that we need if the compiler doesn't support it. +// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it +// supports enough of the standard for our needs +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 + +#include "CXX11Workarounds.h" + namespace Eigen { namespace internal { /** \internal - * \file CXX11/Core/util/CXX11Meta.h + * \file CXX11/util/CXX11Meta.h * This file contains generic metaprogramming classes which are not specifically related to Eigen. * This file expands upon Core/util/Meta.h and adds support for C++11 specific features. */ @@ -523,4 +533,10 @@ InstType instantiate_by_c_array(ArrType* arr) } // end namespace Eigen +#else // Non C++11, fallback to emulation mode + +#include "src/Core/util/EmulateCXX11Meta.h" + +#endif + #endif // EIGEN_CXX11META_H diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h index fe4d22803..fe4d22803 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 579519b04..24159e54c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -222,7 +222,7 @@ template<class T, std::size_t N> struct array_size<const array<T,N>& > { #else -// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen array +// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array #include <array> namespace Eigen { @@ -264,8 +264,4 @@ template<class T, std::size_t N> struct array_size<std::array<T,N> > { #endif - - - - #endif // EIGEN_EMULATE_ARRAY_H diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h index d685d4f9d..f3aa1b144 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h @@ -17,7 +17,7 @@ namespace Eigen { namespace internal { /** \internal - * \file CXX11/Core/util/EmulateCXX11Meta.h + * \file CXX11/util/EmulateCXX11Meta.h * This file emulates a subset of the functionality provided by CXXMeta.h for * compilers that don't yet support cxx11 such as nvcc. */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h index 551124bae..551124bae 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h +++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index f75bf9798..eed9f079e 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -116,6 +116,8 @@ if(EIGEN_TEST_CXX11) set(CMAKE_CXX_STANDARD 11) ei_add_test(cxx11_float16) + ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}") + ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_meta) ei_add_test(cxx11_tensor_simple) # ei_add_test(cxx11_tensor_symmetry) diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp new file mode 100644 index 000000000..f16cc6f07 --- /dev/null +++ b/unsupported/test/cxx11_eventcount.cpp @@ -0,0 +1,140 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS +#include "main.h" +#include <Eigen/CXX11/ThreadPool> + +// Visual studio doesn't implement a rand_r() function since its +// implementation of rand() is already thread safe +int rand_reentrant(unsigned int* s) { +#ifdef EIGEN_COMP_MSVC_STRICT + EIGEN_UNUSED_VARIABLE(s); + return rand(); +#else + return rand_r(s); +#endif +} + +static void test_basic_eventcount() +{ + std::vector<EventCount::Waiter> waiters(1); + EventCount ec(waiters); + EventCount::Waiter& w = waiters[0]; + ec.Notify(false); + ec.Prewait(&w); + ec.Notify(true); + ec.CommitWait(&w); + ec.Prewait(&w); + ec.CancelWait(&w); +} + +// Fake bounded counter-based queue. +struct TestQueue { + std::atomic<int> val_; + static const int kQueueSize = 10; + + TestQueue() : val_() {} + + ~TestQueue() { VERIFY_IS_EQUAL(val_.load(), 0); } + + bool Push() { + int val = val_.load(std::memory_order_relaxed); + for (;;) { + VERIFY_GE(val, 0); + VERIFY_LE(val, kQueueSize); + if (val == kQueueSize) return false; + if (val_.compare_exchange_weak(val, val + 1, std::memory_order_relaxed)) + return true; + } + } + + bool Pop() { + int val = val_.load(std::memory_order_relaxed); + for (;;) { + VERIFY_GE(val, 0); + VERIFY_LE(val, kQueueSize); + if (val == 0) return false; + if (val_.compare_exchange_weak(val, val - 1, std::memory_order_relaxed)) + return true; + } + } + + bool Empty() { return val_.load(std::memory_order_relaxed) == 0; } +}; + +const int TestQueue::kQueueSize; + +// A number of producers send messages to a set of consumers using a set of +// fake queues. Ensure that it does not crash, consumers don't deadlock and +// number of blocked and unblocked threads match. +static void test_stress_eventcount() +{ + const int kThreads = std::thread::hardware_concurrency(); + static const int kEvents = 1 << 16; + static const int kQueues = 10; + + std::vector<EventCount::Waiter> waiters(kThreads); + EventCount ec(waiters); + TestQueue queues[kQueues]; + + std::vector<std::unique_ptr<std::thread>> producers; + for (int i = 0; i < kThreads; i++) { + producers.emplace_back(new std::thread([&ec, &queues]() { + unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id())); + for (int j = 0; j < kEvents; j++) { + unsigned idx = rand_reentrant(&rnd) % kQueues; + if (queues[idx].Push()) { + ec.Notify(false); + continue; + } + EIGEN_THREAD_YIELD(); + j--; + } + })); + } + + std::vector<std::unique_ptr<std::thread>> consumers; + for (int i = 0; i < kThreads; i++) { + consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() { + EventCount::Waiter& w = waiters[i]; + unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id())); + for (int j = 0; j < kEvents; j++) { + unsigned idx = rand_reentrant(&rnd) % kQueues; + if (queues[idx].Pop()) continue; + j--; + ec.Prewait(&w); + bool empty = true; + for (int q = 0; q < kQueues; q++) { + if (!queues[q].Empty()) { + empty = false; + break; + } + } + if (!empty) { + ec.CancelWait(&w); + continue; + } + ec.CommitWait(&w); + } + })); + } + + for (int i = 0; i < kThreads; i++) { + producers[i]->join(); + consumers[i]->join(); + } +} + +void test_cxx11_eventcount() +{ + CALL_SUBTEST(test_basic_eventcount()); + CALL_SUBTEST(test_stress_eventcount()); +} diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp index 2dc0872d8..273dcbc11 100644 --- a/unsupported/test/cxx11_float16.cpp +++ b/unsupported/test/cxx11_float16.cpp @@ -122,6 +122,8 @@ void test_comparison() VERIFY(half(1.0f) != half(2.0f)); // Comparisons with NaNs and infinities. +#if !EIGEN_COMP_MSVC + // Visual Studio errors out on divisions by 0 VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); @@ -132,13 +134,26 @@ void test_comparison() VERIFY(half(1.0) < half(1.0 / 0.0)); VERIFY(half(1.0) > half(-1.0 / 0.0)); +#endif } -void test_functions() +void test_basic_functions() { VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); + VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f); + VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f); + + VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f); + VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f); + + VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f); + VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f); + + VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f); + VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f); + VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); @@ -146,10 +161,32 @@ void test_functions() VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); } +void test_trigonometric_functions() +{ + VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); + VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); + //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); + //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); + + VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); + // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); + VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f))); + + VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f))); + // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI))); + // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2))); + //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f))); +} + void test_cxx11_float16() { CALL_SUBTEST(test_conversion()); CALL_SUBTEST(test_arithmetic()); CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_functions()); + CALL_SUBTEST(test_basic_functions()); + CALL_SUBTEST(test_trigonometric_functions()); } diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index ecac3add1..8911c59d8 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -10,7 +10,7 @@ #include "main.h" #include <array> -#include <Eigen/CXX11/Core> +#include <Eigen/CXX11/src/util/CXX11Meta.h> using Eigen::internal::is_same; using Eigen::internal::type_list; diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp new file mode 100644 index 000000000..d1770ee1b --- /dev/null +++ b/unsupported/test/cxx11_runqueue.cpp @@ -0,0 +1,227 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS +#include <cstdlib> +#include "main.h" +#include <Eigen/CXX11/ThreadPool> + + +// Visual studio doesn't implement a rand_r() function since its +// implementation of rand() is already thread safe +int rand_reentrant(unsigned int* s) { +#ifdef EIGEN_COMP_MSVC_STRICT + EIGEN_UNUSED_VARIABLE(s); + return rand(); +#else + return rand_r(s); +#endif +} + +void test_basic_runqueue() +{ + RunQueue<int, 4> q; + // Check empty state. + VERIFY(q.Empty()); + VERIFY_IS_EQUAL(0u, q.Size()); + VERIFY_IS_EQUAL(0, q.PopFront()); + std::vector<int> stolen; + VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(0u, stolen.size()); + // Push one front, pop one front. + VERIFY_IS_EQUAL(0, q.PushFront(1)); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1, q.PopFront()); + VERIFY_IS_EQUAL(0, q.Size()); + // Push front to overflow. + VERIFY_IS_EQUAL(0, q.PushFront(2)); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(0, q.PushFront(3)); + VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(0, q.PushFront(4)); + VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(0, q.PushFront(5)); + VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(6, q.PushFront(6)); + VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(5, q.PopFront()); + VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(4, q.PopFront()); + VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(3, q.PopFront()); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(2, q.PopFront()); + VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0, q.PopFront()); + // Push one back, pop one back. + VERIFY_IS_EQUAL(0, q.PushBack(7)); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(7, stolen[0]); + VERIFY_IS_EQUAL(0, q.Size()); + stolen.clear(); + // Push back to overflow. + VERIFY_IS_EQUAL(0, q.PushBack(8)); + VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(0, q.PushBack(9)); + VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(0, q.PushBack(10)); + VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(0, q.PushBack(11)); + VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(12, q.PushBack(12)); + VERIFY_IS_EQUAL(4, q.Size()); + // Pop back in halves. + VERIFY_IS_EQUAL(2, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(2, stolen.size()); + VERIFY_IS_EQUAL(10, stolen[0]); + VERIFY_IS_EQUAL(11, stolen[1]); + VERIFY_IS_EQUAL(2, q.Size()); + stolen.clear(); + VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(9, stolen[0]); + VERIFY_IS_EQUAL(1, q.Size()); + stolen.clear(); + VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(8, stolen[0]); + stolen.clear(); + VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(0, stolen.size()); + // Empty again. + VERIFY(q.Empty()); + VERIFY_IS_EQUAL(0, q.Size()); +} + +// Empty tests that the queue is not claimed to be empty when is is in fact not. +// Emptiness property is crucial part of thread pool blocking scheme, +// so we go to great effort to ensure this property. We create a queue with +// 1 element and then push 1 element (either front or back at random) and pop +// 1 element (either front or back at random). So queue always contains at least +// 1 element, but otherwise changes chaotically. Another thread constantly tests +// that the queue is not claimed to be empty. +void test_empty_runqueue() +{ + RunQueue<int, 4> q; + q.PushFront(1); + std::atomic<bool> done(false); + std::thread mutator([&q, &done]() { + unsigned rnd = 0; + std::vector<int> stolen; + for (int i = 0; i < 1 << 18; i++) { + if (rand_reentrant(&rnd) % 2) + VERIFY_IS_EQUAL(0, q.PushFront(1)); + else + VERIFY_IS_EQUAL(0, q.PushBack(1)); + if (rand_reentrant(&rnd) % 2) + VERIFY_IS_EQUAL(1, q.PopFront()); + else { + for (;;) { + if (q.PopBackHalf(&stolen) == 1) { + stolen.clear(); + break; + } + VERIFY_IS_EQUAL(0, stolen.size()); + } + } + } + done = true; + }); + while (!done) { + VERIFY(!q.Empty()); + int size = q.Size(); + VERIFY_GE(size, 1); + VERIFY_LE(size, 2); + } + VERIFY_IS_EQUAL(1, q.PopFront()); + mutator.join(); +} + +// Stress is a chaotic random test. +// One thread (owner) calls PushFront/PopFront, other threads call PushBack/ +// PopBack. Ensure that we don't crash, deadlock, and all sanity checks pass. +void test_stress_runqueue() +{ + static const int kEvents = 1 << 18; + RunQueue<int, 8> q; + std::atomic<int> total(0); + std::vector<std::unique_ptr<std::thread>> threads; + threads.emplace_back(new std::thread([&q, &total]() { + int sum = 0; + int pushed = 1; + int popped = 1; + while (pushed < kEvents || popped < kEvents) { + if (pushed < kEvents) { + if (q.PushFront(pushed) == 0) { + sum += pushed; + pushed++; + } + } + if (popped < kEvents) { + int v = q.PopFront(); + if (v != 0) { + sum -= v; + popped++; + } + } + } + total += sum; + })); + for (int i = 0; i < 2; i++) { + threads.emplace_back(new std::thread([&q, &total]() { + int sum = 0; + for (int j = 1; j < kEvents; j++) { + if (q.PushBack(j) == 0) { + sum += j; + continue; + } + EIGEN_THREAD_YIELD(); + j--; + } + total += sum; + })); + threads.emplace_back(new std::thread([&q, &total]() { + int sum = 0; + std::vector<int> stolen; + for (int j = 1; j < kEvents;) { + if (q.PopBackHalf(&stolen) == 0) { + EIGEN_THREAD_YIELD(); + continue; + } + while (stolen.size() && j < kEvents) { + int v = stolen.back(); + stolen.pop_back(); + VERIFY_IS_NOT_EQUAL(v, 0); + sum += v; + j++; + } + } + while (stolen.size()) { + int v = stolen.back(); + stolen.pop_back(); + VERIFY_IS_NOT_EQUAL(v, 0); + while ((v = q.PushBack(v)) != 0) EIGEN_THREAD_YIELD(); + } + total -= sum; + })); + } + for (size_t i = 0; i < threads.size(); i++) threads[i]->join(); + VERIFY(q.Empty()); + VERIFY(total.load() == 0); +} + +void test_cxx11_runqueue() +{ + CALL_SUBTEST_1(test_basic_runqueue()); + CALL_SUBTEST_2(test_empty_runqueue()); + CALL_SUBTEST_3(test_stress_runqueue()); +} diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 134359611..4026f48f0 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -661,6 +661,9 @@ void test_cuda_digamma() for (int i = 5; i < 7; ++i) { VERIFY_IS_EQUAL(out(i), expected_out(i)); } + + cudaFree(d_in); + cudaFree(d_out); } template <typename Scalar> @@ -718,13 +721,17 @@ void test_cuda_zeta() assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY_IS_APPROX_OR_LESS_THAN(out(3), expected_out(3)); + VERIFY((std::isnan)(out(3))); for (int i = 1; i < 6; ++i) { if (i != 3) { VERIFY_IS_APPROX(out(i), expected_out(i)); } } + + cudaFree(d_in_x); + cudaFree(d_in_q); + cudaFree(d_out); } template <typename Scalar> @@ -787,6 +794,10 @@ void test_cuda_polygamma() for (int i = 0; i < 7; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } + + cudaFree(d_in_x); + cudaFree(d_in_n); + cudaFree(d_out); } template <typename Scalar> @@ -826,9 +837,9 @@ void test_cuda_igamma() Scalar* d_a; Scalar* d_x; Scalar* d_out; - cudaMalloc((void**)(&d_a), bytes); - cudaMalloc((void**)(&d_x), bytes); - cudaMalloc((void**)(&d_out), bytes); + assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); @@ -854,6 +865,10 @@ void test_cuda_igamma() } } } + + cudaFree(d_a); + cudaFree(d_x); + cudaFree(d_out); } template <typename Scalar> @@ -920,6 +935,10 @@ void test_cuda_igammac() } } } + + cudaFree(d_a); + cudaFree(d_x); + cudaFree(d_out); } template <typename Scalar> @@ -935,8 +954,8 @@ void test_cuda_erf(const Scalar stddev) Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess); + assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 1c33fefb3..5fe164859 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -20,6 +20,8 @@ static void test_0d() TensorFixedSize<float, Sizes<> > scalar1; TensorFixedSize<float, Sizes<>, RowMajor> scalar2; VERIFY_IS_EQUAL(scalar1.rank(), 0); + VERIFY_IS_EQUAL(scalar1.size(), 1); + VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1); scalar1() = 7.0; scalar2() = 13.0; diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp index d247bebaa..61c742a16 100644 --- a/unsupported/test/cxx11_tensor_math.cpp +++ b/unsupported/test/cxx11_tensor_math.cpp @@ -16,7 +16,7 @@ using Eigen::RowMajor; static void test_tanh() { - Tensor<float, 1> vec1({6}); + Tensor<float, 1> vec1(6); vec1.setRandom(); Tensor<float, 1> vec2 = vec1.tanh(); @@ -28,7 +28,7 @@ static void test_tanh() static void test_sigmoid() { - Tensor<float, 1> vec1({6}); + Tensor<float, 1> vec1(6); vec1.setRandom(); Tensor<float, 1> vec2 = vec1.sigmoid(); diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp index 72f826216..4fba6fdd1 100644 --- a/unsupported/test/cxx11_tensor_mixed_indices.cpp +++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp @@ -14,8 +14,8 @@ static void test_simple() { - Tensor<float, 1, ColMajor> vec1({6}); - Tensor<float, 1, ColMajor, int> vec2({6}); + Tensor<float, 1, ColMajor> vec1(6); + Tensor<float, 1, ColMajor, int> vec2(6); vec1(0) = 4.0; vec2(0) = 0.0; vec1(1) = 8.0; vec2(1) = 1.0; diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index cb917bb37..154a72d5c 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -228,6 +228,42 @@ void test_cuda_reductions() { gpu_device.deallocate(d_res_float); } +void test_cuda_forced_evals() { + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>(); + + Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking unary " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} #endif @@ -246,6 +282,7 @@ void test_cxx11_tensor_of_float16_cuda() CALL_SUBTEST_1(test_cuda_elementwise()); CALL_SUBTEST_2(test_cuda_contractions()); CALL_SUBTEST_3(test_cuda_reductions()); + CALL_SUBTEST_4(test_cuda_forced_evals()); } else { std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl; diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp index 6dc17bd17..64f168c16 100644 --- a/unsupported/test/levenberg_marquardt.cpp +++ b/unsupported/test/levenberg_marquardt.cpp @@ -792,7 +792,9 @@ void testNistMGH10(void) MGH10_functor functor; LevenbergMarquardt<MGH10_functor> lm(functor); info = lm.minimize(x); + ++g_test_level; VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + --g_test_level; // was: VERIFY_IS_EQUAL(info, 1); // check norm^2 |