diff options
-rw-r--r-- | Eigen/src/Core/functors/BinaryFunctors.h | 47 | ||||
-rw-r--r-- | Eigen/src/Core/functors/UnaryFunctors.h | 43 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/Tensor | 2 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 138 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 10 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 71 |
6 files changed, 156 insertions, 155 deletions
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 5cd8ca950..c90716eba 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -89,13 +89,13 @@ template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op { enum { Conj = NumTraits<LhsScalar>::IsComplex }; - + typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type; - + EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); } - + template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); } @@ -591,6 +591,47 @@ template<typename Scalar> struct functor_traits<scalar_inverse_mult_op<Scalar> > { enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; }; +/** \internal + * \brief Template functor to compute the modulo between an array and a fixed scalar. + */ +template <typename Scalar> +struct scalar_mod_op { + EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } + const Scalar m_divisor; +}; +template <typename Scalar> +struct functor_traits<scalar_mod_op<Scalar> > +{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; }; + +/** \internal + * \brief Template functor to compute the modulo between two arrays. + */ +template <typename Scalar> +struct scalar_mod2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } +}; +template <typename Scalar> +struct functor_traits<scalar_mod2_op<Scalar> > +{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; }; + +/** \internal + * \brief Template functor to compute the float modulo between two arrays. + */ +template <typename Scalar> +struct scalar_fmod_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::fmod(a, b); + } +}; +template <typename Scalar> +struct functor_traits<scalar_fmod_op<Scalar> > { + enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell. + PacketAccess = false }; +}; } // end namespace internal diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 3f7a635be..65a9bd144 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -496,7 +496,7 @@ struct functor_traits<scalar_digamma_op<Scalar> > PacketAccess = packet_traits<Scalar>::HasDiGamma }; }; - + /** \internal * \brief Template functor to compute the Riemann Zeta function of two arguments. * \sa class CwiseUnaryOp, Cwise::zeta() @@ -587,6 +587,33 @@ struct functor_traits<scalar_erfc_op<Scalar> > }; }; +/** \internal + * \brief Template functor to compute the sigmoid of a scalar + * \sa class CwiseUnaryOp, ArrayBase::sigmoid() + */ +template <typename T> +struct scalar_sigmoid_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + const T one = T(1); + return one / (one + numext::exp(-x)); + } + + template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { + const Packet one = pset1<Packet>(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + +template <typename T> +struct functor_traits<scalar_sigmoid_op<T> > { + enum { + Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6, + PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv && + packet_traits<T>::HasNegate && packet_traits<T>::HasExp + }; +}; /** \internal * \brief Template functor to compute the atan of a scalar @@ -627,7 +654,7 @@ template<typename Scalar> struct scalar_tanh_op { const Packet plus_9 = pset1<Packet>(9.0); const Packet minus_9 = pset1<Packet>(-9.0); const Packet x = pmax(minus_9, pmin(plus_9, _x)); - + // The monomial coefficients of the numerator polynomial (odd). const Packet alpha_1 = pset1<Packet>(4.89352455891786e-03); const Packet alpha_3 = pset1<Packet>(6.37261928875436e-04); @@ -636,16 +663,16 @@ template<typename Scalar> struct scalar_tanh_op { const Packet alpha_9 = pset1<Packet>(-8.60467152213735e-11); const Packet alpha_11 = pset1<Packet>(2.00018790482477e-13); const Packet alpha_13 = pset1<Packet>(-2.76076847742355e-16); - + // The monomial coefficients of the denominator polynomial (even). const Packet beta_0 = pset1<Packet>(4.89352518554385e-03); const Packet beta_2 = pset1<Packet>(2.26843463243900e-03); const Packet beta_4 = pset1<Packet>(1.18534705686654e-04); const Packet beta_6 = pset1<Packet>(1.19825839466702e-06); - + // Since the polynomials are odd/even, we need x^2. const Packet x2 = pmul(x, x); - + // Evaluate the numerator polynomial p. Packet p = pmadd(x2, alpha_13, alpha_11); p = pmadd(x2, p, alpha_9); @@ -654,12 +681,12 @@ template<typename Scalar> struct scalar_tanh_op { p = pmadd(x2, p, alpha_3); p = pmadd(x2, p, alpha_1); p = pmul(x, p); - + // Evaluate the denominator polynomial p. Packet q = pmadd(x2, beta_6, beta_4); q = pmadd(x2, q, beta_2); q = pmadd(x2, q, beta_0); - + // Divide the numerator by the denominator. return pdiv(p, q); } @@ -938,7 +965,7 @@ struct scalar_sign_op<Scalar,true> { template<typename Scalar> struct functor_traits<scalar_sign_op<Scalar> > { enum { - Cost = + Cost = NumTraits<Scalar>::IsComplex ? ( 8*NumTraits<Scalar>::MulCost ) // roughly : ( 3*NumTraits<Scalar>::AddCost), diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 1e97ad3c0..e45612b58 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -69,6 +69,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorMacros.h" #include "src/Tensor/TensorForwardDeclarations.h" #include "src/Tensor/TensorMeta.h" +#include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" #include "src/Tensor/TensorDeviceCuda.h" @@ -83,7 +84,6 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorBase.h" -#include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 4df4cc220..fab1e316a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -172,67 +172,69 @@ struct ThreadPoolDevice { pool_->Schedule(func); } - // parallelFor executes f with [0, size) arguments in parallel and waits for - // completion. Block size is choosen between min_block_size and - // 2 * min_block_size to achieve the best parallel efficiency. - // If min_block_size == -1, parallelFor uses block size of 1. - // If hard_align > 0, block size is aligned to hard_align. - // If soft_align > hard_align, block size is aligned to soft_align provided - // that it does not increase block size too much. - void parallelFor(Index size, Index min_block_size, Index hard_align, - Index soft_align, + // parallelFor executes f with [0, n) arguments in parallel and waits for + // completion. F accepts a half-open interval [first, last). + // Block size is choosen based on the iteration cost and resulting parallel + // efficiency. If block_align is not nullptr, it is called to round up the + // block size. + void parallelFor(Index n, const TensorOpCost& cost, + std::function<Index(Index)> block_align, std::function<void(Index, Index)> f) const { - if (size <= 1 || (min_block_size != -1 && size < min_block_size) || - numThreads() == 1) { - f(0, size); + typedef TensorCostModel<ThreadPoolDevice> CostModel; + if (n <= 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, numThreads()) == 1) { + f(0, n); return; } - Index block_size = 1; - Index block_count = size; - if (min_block_size != -1) { - // Calculate block size based on (1) estimated cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate - // parallelization overheads; not too large to mitigate tail effect and - // potential load imbalance and we also want number of blocks to be evenly - // dividable across threads. - min_block_size = numext::maxi<Index>(min_block_size, 1); - block_size = numext::mini(min_block_size, size); - // Upper bound on block size: - const Index max_block_size = numext::mini(min_block_size * 2, size); - block_size = numext::mini( - alignBlockSize(block_size, hard_align, soft_align), size); - block_count = divup(size, block_size); - // Calculate parallel efficiency as fraction of total CPU time used for - // computations: - double max_efficiency = - static_cast<double>(block_count) / - (divup<int>(block_count, numThreads()) * numThreads()); - // Now try to increase block size up to max_block_size as long as it - // doesn't decrease parallel efficiency. - for (Index prev_block_count = block_count; prev_block_count > 1;) { - // This is the next block size that divides size into a smaller number - // of blocks than the current block_size. - Index coarser_block_size = divup(size, prev_block_count - 1); - coarser_block_size = - alignBlockSize(coarser_block_size, hard_align, soft_align); - if (coarser_block_size > max_block_size) { - break; // Reached max block size. Stop. - } - // Recalculate parallel efficiency. - const Index coarser_block_count = divup(size, coarser_block_size); - eigen_assert(coarser_block_count < prev_block_count); - prev_block_count = coarser_block_count; - const double coarser_efficiency = - static_cast<double>(coarser_block_count) / - (divup<int>(coarser_block_count, numThreads()) * numThreads()); - if (coarser_efficiency + 0.01 >= max_efficiency) { - // Taking it. - block_size = coarser_block_size; - block_count = coarser_block_count; - if (max_efficiency < coarser_efficiency) { - max_efficiency = coarser_efficiency; - } + // Calculate block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate + // parallelization overheads; not too large to mitigate tail + // effect and potential load imbalance and we also want number + // of blocks to be evenly dividable across threads. + + double block_size_f = 1.0 / CostModel::taskSize(1, cost); + Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f)); + const Index max_block_size = + numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f)); + if (block_align) { + Index new_block_size = block_align(block_size); + eigen_assert(new_block_size >= block_size); + block_size = numext::mini(n, new_block_size); + } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for + // computations: + double max_efficiency = + static_cast<double>(block_count) / + (divup<int>(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it + // doesn't decrease parallel efficiency. + for (Index prev_block_count = block_count; prev_block_count > 1;) { + // This is the next block size that divides size into a smaller number + // of blocks than the current block_size. + Index coarser_block_size = divup(n, prev_block_count - 1); + if (block_align) { + Index new_block_size = block_align(coarser_block_size); + eigen_assert(new_block_size >= coarser_block_size); + coarser_block_size = numext::mini(n, new_block_size); + } + if (coarser_block_size > max_block_size) { + break; // Reached max block size. Stop. + } + // Recalculate parallel efficiency. + const Index coarser_block_count = divup(n, coarser_block_size); + eigen_assert(coarser_block_count < prev_block_count); + prev_block_count = coarser_block_count; + const double coarser_efficiency = + static_cast<double>(coarser_block_count) / + (divup<int>(coarser_block_count, numThreads()) * numThreads()); + if (coarser_efficiency + 0.01 >= max_efficiency) { + // Taking it. + block_size = coarser_block_size; + block_count = coarser_block_count; + if (max_efficiency < coarser_efficiency) { + max_efficiency = coarser_efficiency; } } } @@ -251,26 +253,20 @@ struct ThreadPoolDevice { } // Split into halves and submit to the pool. Index mid = first + divup((last - first) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); + enqueue_func([=, &handleRange]() { handleRange(mid, last); }); + enqueue_func([=, &handleRange]() { handleRange(first, mid); }); }; - handleRange(0, size); + handleRange(0, n); barrier.Wait(); } - private: - static Index alignBlockSize(Index size, Index hard_align, Index soft_align) { - if (soft_align > hard_align && size >= 4 * soft_align) { - // Align to soft_align, if it won't increase size by more than 25%. - return (size + soft_align - 1) & ~(soft_align - 1); - } - if (hard_align > 0) { - return (size + hard_align - 1) & ~(hard_align - 1); - } - return size; + // Convinience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, + std::function<void(Index, Index)> f) const { + parallelFor(n, cost, nullptr, std::move(f)); } - + private: ThreadPoolInterface* pool_; size_t num_threads_; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 1155354cd..e0df13e78 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -137,6 +137,13 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { { const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; const Index size = array_prod(evaluator.dimensions()); +#if defined(EIGEN_USE_NONBLOCKING_THREAD_POOL) && defined(EIGEN_USE_COST_MODEL) + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange::run(&evaluator, first, last); + }); +#else size_t num_threads = device.numThreads(); #ifdef EIGEN_USE_COST_MODEL if (num_threads > 1) { @@ -163,11 +170,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { } barrier.Wait(); } +#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL } evaluator.cleanup(); } }; -#endif +#endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index c674fcfe1..d07063444 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -13,77 +13,6 @@ namespace Eigen { namespace internal { - -/** \internal - * \brief Template functor to compute the modulo between an array and a scalar. - */ -template <typename Scalar> -struct scalar_mod_op { - EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } - const Scalar m_divisor; -}; -template <typename Scalar> -struct functor_traits<scalar_mod_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; }; - - -/** \internal - * \brief Template functor to compute the modulo between 2 arrays. - */ -template <typename Scalar> -struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } -}; -template <typename Scalar> -struct functor_traits<scalar_mod2_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; }; - -template <typename Scalar> -struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar - operator()(const Scalar& a, const Scalar& b) const { - return numext::fmod(a, b); - } -}; -template <typename Scalar> -struct functor_traits<scalar_fmod_op<Scalar> > { - enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell. - PacketAccess = false }; -}; - - -/** \internal - * \brief Template functor to compute the sigmoid of a scalar - * \sa class CwiseUnaryOp, ArrayBase::sigmoid() - */ -template <typename T> -struct scalar_sigmoid_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { - const T one = T(1); - return one / (one + numext::exp(-x)); - } - - template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(const Packet& x) const { - const Packet one = pset1<Packet>(T(1)); - return pdiv(one, padd(one, pexp(pnegate(x)))); - } -}; - -template <typename T> -struct functor_traits<scalar_sigmoid_op<T> > { - enum { - Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6, - PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv && - packet_traits<T>::HasNegate && packet_traits<T>::HasExp - }; -}; - - // Standard reduction functors template <typename T> struct SumReducer { |