Diffstat (limited to 'unsupported')
89 files changed, 4328 insertions, 898 deletions
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index 2fc8db412..631a06014 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -18,6 +18,7 @@ set(Eigen_HEADERS
   Polynomials
   Skyline
   SparseExtra
+  SpecialFunctions
   Splines
   )
@@ -26,5 +27,6 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
   )
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
 add_subdirectory(CXX11)
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index a40bc4715..385ed240c 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -5,4 +5,4 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
   )
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 859147404..f7b94cee1 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -15,6 +15,7 @@
 #include <Eigen/src/Core/util/DisableStupidWarnings.h>
 
+#include "../SpecialFunctions"
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
@@ -80,6 +81,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorTraits.h"
 #include "src/Tensor/TensorUInt128.h"
 #include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
 
 #include "src/Tensor/TensorBase.h"
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
deleted file mode 100644
index 1734262bb..000000000
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(util)
-add_subdirectory(ThreadPool)
-add_subdirectory(Tensor)
-add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_Tensor_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index fda33edda..02146527b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1102,7 +1102,7 @@ Example: Reduction along two dimensions.
 
 As a special case, if you pass no parameter to a reduction operation the
 original tensor is reduced along *all* its dimensions.  The result is a
-one-dimension tensor with a single value.
+scalar, represented as a zero-dimension tensor.
 
     Eigen::Tensor<float, 3> a(2, 3, 4);
     a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
@@ -1112,7 +1112,7 @@ one-dimension tensor with a single value.
                   {19.0f, 18.0f, 17.0f, 16.0f},
                   {20.0f, 21.0f, 22.0f, 23.0f}}});
     // Reduce along all dimensions using the sum() operator.
-    Eigen::Tensor<float, 1> b = a.sum();
+    Eigen::Tensor<float, 0> b = a.sum();
     cout << "b" << endl << b << endl << endl;
    =>
    b
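Note on the README change above: a full reduction now yields a rank-0 tensor, and the scalar is read back with operator(). A minimal sketch of the updated usage (illustrative, not part of the commit):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 3> a(2, 3, 4);
      a.setConstant(1.0f);
      Eigen::Tensor<float, 0> b = a.sum();  // rank-0 result instead of rank-1
      float total = b();                    // read the scalar back
      std::cout << total << std::endl;      // prints 24
      return 0;
    }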
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 1eaa8d4fc..7a45a5cf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+    log1p() const {
+      return unaryExpr(internal::scalar_log1p_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
       return unaryExpr(internal::scalar_abs_op<Scalar>());
@@ -204,34 +210,74 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
     pow(Scalar exponent) const {
-      return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+    real() const {
+      return unaryExpr(internal::scalar_real_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+    imag() const {
+      return unaryExpr(internal::scalar_imag_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
     operator+ (Scalar rhs) const {
-      return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+    operator+ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
     operator- (Scalar rhs) const {
       EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+    operator- (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
    operator* (Scalar rhs) const {
-      return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+    operator* (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
     operator/ (Scalar rhs) const {
-      return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+    operator/ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
@@ -277,7 +323,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_floor_op<Scalar>());
     }
 
-
     // Generic binary operation support.
     template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
@@ -342,66 +387,66 @@ class TensorBase<Derived, ReadOnlyAccessors>
     // Comparisons and tests.
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
     operator<(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
     operator<=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
    }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
     operator>(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
     operator>=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
     operator==(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
     operator!=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
     }
 
     // comparisons and tests for Scalars
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator<(Scalar threshold) const {
       return operator<(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator<=(Scalar threshold) const {
       return operator<=(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator>(Scalar threshold) const {
       return operator>(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator>=(Scalar threshold) const {
       return operator>=(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator==(Scalar threshold) const {
       return operator==(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator!=(Scalar threshold) const {
       return operator!=(constant(threshold));
     }
@@ -457,15 +502,22 @@ class TensorBase<Derived, ReadOnlyAccessors>
     typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorScanSumOp
-    cumsum(const Index& axis) const {
-      return TensorScanSumOp(derived(), axis);
+    cumsum(const Index& axis, bool exclusive = false) const {
+      return TensorScanSumOp(derived(), axis, exclusive);
     }
 
     typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorScanProdOp
-    cumprod(const Index& axis) const {
-      return TensorScanProdOp(derived(), axis);
+    cumprod(const Index& axis, bool exclusive = false) const {
+      return TensorScanProdOp(derived(), axis, exclusive);
+    }
+
+    template <typename Reducer>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanOp<Reducer, const Derived>
+    scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+      return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
     }
 
     // Reductions.
@@ -771,8 +823,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
 
-template<typename Derived>
-class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
  public:
     typedef internal::traits<Derived> DerivedTraits;
     typedef typename DerivedTraits::Scalar Scalar;
@@ -782,7 +834,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
 
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+    template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setZero() {
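The TensorBase.h changes above add scalar-on-the-left arithmetic (via bind1st_op), an exclusive mode for cumsum/cumprod, and a generic scan(). A minimal sketch against this commit's API (the values in comments follow from the semantics, they are not output captured from the commit):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> t(4);
      t.setValues({1.0f, 2.0f, 3.0f, 4.0f});

      // A scalar may now appear on the left of +, -, * and /.
      Eigen::Tensor<float, 1> u = 1.0f / t;

      // Exclusive scan: each output drops the element at its own position.
      Eigen::Tensor<float, 1> c = t.cumsum(0, /*exclusive=*/true);  // 0, 1, 3, 6

      // Generic scan with an arbitrary reducer.
      Eigen::Tensor<float, 1> m = t.scan(0, Eigen::internal::MaxReducer<float>());
      return 0;
    }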
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 56d9c2025..a6001074b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,8 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
-  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
-                                                  typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename gebp_traits<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ResScalar Scalar;
+
   typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                         typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -75,8 +75,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
-  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
-                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 886474986..d65dbb40f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -461,8 +461,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
 #undef writeResultShmem
 #undef writeRow
 
-  const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
-  const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
 
   if (threadIdx.x < max_i_write) {
     if (max_j_write == 8) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index b27e1a1b4..9b2cb3ff6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -130,19 +130,19 @@ class SimpleTensorContractionMapper {
     }
 
     Index contract_val = left ? col : row;
-    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
-      const Index idx = contract_val / m_k_strides[i];
-      linidx += idx * m_contract_strides[i];
-      contract_val -= idx * m_k_strides[i];
-    }
-
     if(array_size<contract_t>::value > 0) {
-      if (side == Rhs && inner_dim_contiguous) {
-        eigen_assert(m_contract_strides[0] == 1);
-        linidx += contract_val;
-      } else {
-        linidx += contract_val * m_contract_strides[0];
-      }
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx = contract_val / m_k_strides[i];
+        linidx += idx * m_contract_strides[i];
+        contract_val -= idx * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx += contract_val;
+      } else {
+        linidx += contract_val * m_contract_strides[0];
+      }
     }
     return linidx;
@@ -153,15 +153,15 @@ class SimpleTensorContractionMapper {
     const bool left = (side == Lhs);
    Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
     Index linidx[2] = {0, 0};
-    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
-      const Index idx0 = nocontract_val[0] / m_ij_strides[i];
-      const Index idx1 = nocontract_val[1] / m_ij_strides[i];
-      linidx[0] += idx0 * m_nocontract_strides[i];
-      linidx[1] += idx1 * m_nocontract_strides[i];
-      nocontract_val[0] -= idx0 * m_ij_strides[i];
-      nocontract_val[1] -= idx1 * m_ij_strides[i];
-    }
     if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+        const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+        linidx[0] += idx0 * m_nocontract_strides[i];
+        linidx[1] += idx1 * m_nocontract_strides[i];
+        nocontract_val[0] -= idx0 * m_ij_strides[i];
+        nocontract_val[1] -= idx1 * m_ij_strides[i];
+      }
       if (side == Lhs && inner_dim_contiguous) {
         eigen_assert(m_nocontract_strides[0] == 1);
         linidx[0] += nocontract_val[0];
@@ -173,22 +173,24 @@ class SimpleTensorContractionMapper {
     }
 
     Index contract_val[2] = {left ? col : row, left ? col : row + distance};
-    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
-      const Index idx0 = contract_val[0] / m_k_strides[i];
-      const Index idx1 = contract_val[1] / m_k_strides[i];
-      linidx[0] += idx0 * m_contract_strides[i];
-      linidx[1] += idx1 * m_contract_strides[i];
-      contract_val[0] -= idx0 * m_k_strides[i];
-      contract_val[1] -= idx1 * m_k_strides[i];
-    }
+    if (array_size<contract_t>::value> 0) {
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = contract_val[0] / m_k_strides[i];
+        const Index idx1 = contract_val[1] / m_k_strides[i];
+        linidx[0] += idx0 * m_contract_strides[i];
+        linidx[1] += idx1 * m_contract_strides[i];
+        contract_val[0] -= idx0 * m_k_strides[i];
+        contract_val[1] -= idx1 * m_k_strides[i];
+      }
 
-    if (side == Rhs && inner_dim_contiguous) {
-      eigen_assert(m_contract_strides[0] == 1);
-      linidx[0] += contract_val[0];
-      linidx[1] += contract_val[1];
-    } else {
-      linidx[0] += contract_val[0] * m_contract_strides[0];
-      linidx[1] += contract_val[1] * m_contract_strides[0];
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx[0] += contract_val[0];
+        linidx[1] += contract_val[1];
+      } else {
+        linidx[0] += contract_val[0] * m_contract_strides[0];
+        linidx[1] += contract_val[1] * m_contract_strides[0];
+      }
     }
     return IndexPair<Index>(linidx[0], linidx[1]);
   }
@@ -200,7 +202,7 @@ class SimpleTensorContractionMapper {
     return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
   }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
-    return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
+    return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
   }
 
  protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index a60a17049..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice>
     // across k dimension.
     const TensorOpCost cost = contractionCost(m, n, bm, bn, bk, shard_by_col, false);
-    Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());
 
     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice>
   class Context {
    public:
     Context(const Device& device, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
             Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
             Index gn, Index nm0, Index nn0, bool shard_by_col,
             bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice>
           lhs_(lhs),
           rhs_(rhs),
           buffer_(buffer),
-          output_(buffer, m),
+          output_(buffer, tm),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
-          m_(m),
-          n_(n),
-          k_(k),
+          m_(tm),
+          n_(tn),
+          k_(tk),
           bm_(bm),
           bn_(bn),
           bk_(bk),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index a76c8ca35..d66e45d50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -91,21 +91,21 @@ class TensorOpCost {
   }
 
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
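cwiseMin/cwiseMax above switch from mutating *this to returning a fresh TensorOpCost. A short sketch of the new value semantics (illustrative, not part of the commit):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::TensorOpCost a(/*bytes_loaded=*/64, /*bytes_stored=*/0, /*compute_cycles=*/10);
      Eigen::TensorOpCost b(32, 8, 20);
      Eigen::TensorOpCost lo = a.cwiseMin(b);  // a and b are left unchanged
      Eigen::TensorOpCost hi = a.cwiseMax(b);
      std::cout << lo.bytes_loaded() << " " << lo.compute_cycles() << "\n";  // 32 10
      std::cout << hi.bytes_loaded() << " " << hi.compute_cycles() << "\n";  // 64 20
      return 0;
    }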
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 6c12b2ed8..1468caa23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -12,6 +12,8 @@
 
 namespace Eigen {
 
+static const int kCudaScratchSize = 1024;
+
 // This defines an interface that GPUDevice can take to use
 // CUDA streams underneath.
 class StreamInterface {
@@ -27,6 +29,12 @@ class StreamInterface {
 
   // Return a scratchpad buffer of size 1k
   virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
 };
 
 static cudaDeviceProp* m_deviceProperties;
@@ -65,12 +73,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
@@ -78,7 +86,7 @@ class CudaStreamDevice : public StreamInterface {
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device), scratch_(NULL) {
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -123,15 +131,27 @@ class CudaStreamDevice : public StreamInterface {
 
   virtual void* scratchpad() const {
     if (scratch_ == NULL) {
-      scratch_ = allocate(1024);
+      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
     }
     return scratch_;
   }
 
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+    }
+    return semaphore_;
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
   mutable void* scratch_;
+  mutable unsigned int* semaphore_;
 };
 
 struct GpuDevice {
@@ -174,6 +194,15 @@ struct GpuDevice {
 #endif
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+#ifndef __CUDA_ARCH__
+    return stream_->semaphore();
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return NULL;
+#endif
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
     cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
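The semaphore contract above (starts at 0, each kernel must restore 0 on completion) is what GPU kernels that need cross-block coordination rely on. A hypothetical kernel, not from this commit, showing one way to honor the invariant:

    // Hypothetical illustration: the last block to arrive does the final step,
    // then resets the semaphore so the invariant holds for the next launch.
    __global__ void LastBlockWins(unsigned int* semaphore, unsigned int num_blocks, float* out) {
      if (threadIdx.x == 0) {
        unsigned int ticket = atomicAdd(semaphore, 1u);  // one ticket per block
        if (ticket == num_blocks - 1) {
          *out = 1.0f;
          *semaphore = 0;
        }
      }
    }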
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index d31b0ad38..069680a11 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
 
@@ -151,9 +151,7 @@ struct ThreadPoolDevice {
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();
-    std::function<void()> func =
-        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
     return n;
   }
 
@@ -161,15 +159,19 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
                                                 Args&&... args) const {
-    std::function<void()> func = std::bind(
-        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(
+        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
   }
 
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
-    std::function<void()> func = std::bind(f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(f, args...));
+  }
+
+  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+  // called from one of the threads in pool_. Returns -1 otherwise.
+  EIGEN_STRONG_INLINE int currentThreadId() const {
+    return pool_->CurrentThreadId();
   }
 
   // parallelFor executes f with [0, n) arguments in parallel and waits for
@@ -182,7 +184,7 @@ struct ThreadPoolDevice {
                            std::function<void(Index, Index)> f) const {
     typedef TensorCostModel<ThreadPoolDevice> CostModel;
     if (n <= 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, numThreads()) == 1) {
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
       return;
     }
@@ -242,7 +244,7 @@ struct ThreadPoolDevice {
     // Recursively divide size into halves until we reach block_size.
     // Division code rounds mid to block_size, so we are guaranteed to get
     // block_count leaves that do actual computations.
-    Barrier barrier(block_count);
+    Barrier barrier(static_cast<unsigned int>(block_count));
     std::function<void(Index, Index)> handleRange;
     handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
       if (last - first <= block_size) {
@@ -268,7 +270,7 @@ struct ThreadPoolDevice {
 
  private:
   ThreadPoolInterface* pool_;
-  size_t num_threads_;
+  int num_threads_;
 };
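A minimal sketch of driving the (now int-typed) ThreadPoolDevice and its parallelFor directly; the cost figures are illustrative, and the Eigen::ThreadPool type from the ThreadPool module is assumed:

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <vector>

    int main() {
      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);  // int, not size_t

      std::vector<float> data(1000, 1.0f);
      Eigen::TensorOpCost cost(sizeof(float), sizeof(float), /*compute_cycles=*/1);
      device.parallelFor(data.size(), cost,
                         [&](Eigen::Index first, Eigen::Index last) {
                           for (Eigen::Index i = first; i < last; ++i) data[i] *= 2.0f;
                         });
      return 0;
    }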
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 26b1f65a8..a08dfa7c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned = true,
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 31b361c83..33ffaa600 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -131,7 +131,7 @@ double loadConstant(const double* address) {
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 Eigen::half loadConstant(const Eigen::half* address) {
-  return Eigen::half(internal::raw_uint16_to_half(__ldg(&address->x)));
+  return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
 }
 #endif
 }
@@ -403,6 +403,101 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
   TensorEvaluator<RightArgType, Device> m_rightImpl;
 };
 
+// -------------------- CwiseTernaryOp --------------------
+
+template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
+{
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+  enum {
+    IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
+                   internal::functor_traits<TernaryOp>::PacketAccess,
+    Layout = TensorEvaluator<Arg1Type, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+    : m_functor(op.functor()),
+      m_arg1Impl(op.arg1Expression(), device),
+      m_arg2Impl(op.arg2Expression(), device),
+      m_arg3Impl(op.arg3Expression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg2Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg3Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+
+    eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+    return m_arg1Impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    m_arg1Impl.evalSubExprsIfNeeded(NULL);
+    m_arg2Impl.evalSubExprsIfNeeded(NULL);
+    m_arg3Impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_arg1Impl.cleanup();
+    m_arg2Impl.cleanup();
+    m_arg3Impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
+                              m_arg2Impl.template packet<LoadMode>(index),
+                              m_arg3Impl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+    return m_arg1Impl.costPerCoeff(vectorized) +
+           m_arg2Impl.costPerCoeff(vectorized) +
+           m_arg3Impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+  const TernaryOp m_functor;
+  TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+  TensorEvaluator<Arg2Type, Device> m_arg2Impl;
+  TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
 
 // -------------------- SelectOp --------------------
 
@@ -479,7 +574,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
 
  private:
   TensorEvaluator<IfArgType, Device> m_condImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index ad5c97b57..a116bf17f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -159,7 +159,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 #else
     size_t num_threads = device.numThreads();
     if (num_threads > 1) {
-      cost = evaluator.costPerCoeff(Vectorizable)
       num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
           size, evaluator.costPerCoeff(Vectorizable), num_threads);
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index ea250d8bc..5f2e329f2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -219,6 +219,86 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
 
 namespace internal {
 
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
+{
+  // Type promotion to handle the case where the types of the args are different.
+  typedef typename result_of<
+      TernaryOp(typename Arg1XprType::Scalar,
+                typename Arg2XprType::Scalar,
+                typename Arg3XprType::Scalar)>::type Scalar;
+  typedef traits<Arg1XprType> XprTraits;
+  typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+  typedef typename traits<Arg1XprType>::Index Index;
+  typedef typename Arg1XprType::Nested Arg1Nested;
+  typedef typename Arg2XprType::Nested Arg2Nested;
+  typedef typename Arg3XprType::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+  typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef Scalar CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+        : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC
+    const TernaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+    arg1Expression() const { return m_arg1_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+    arg2Expression() const { return m_arg2_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+    arg3Expression() const { return m_arg3_xpr; }
+
+  protected:
+    typename Arg1XprType::Nested m_arg1_xpr;
+    typename Arg2XprType::Nested m_arg2_xpr;
+    typename Arg3XprType::Nested m_arg3_xpr;
+    const TernaryOp m_functor;
+};
+
+
+namespace internal {
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
 struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
     : traits<ThenXprType>
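TensorCwiseTernaryOp is generic: any ternary functor can be evaluated element-wise by constructing the expression directly (TensorBase does not expose a ternaryExpr() helper in this commit). A sketch with a hypothetical clamp functor:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Hypothetical functor, not part of the commit.
    template <typename T>
    struct ClampOp {
      EIGEN_DEVICE_FUNC T operator()(const T& x, const T& lo, const T& hi) const {
        return x < lo ? lo : (x > hi ? hi : x);
      }
    };

    int main() {
      typedef Eigen::Tensor<float, 1> Vec;
      Vec x(3), lo(3), hi(3);
      x.setValues({-2.0f, 0.5f, 9.0f});
      lo.setConstant(0.0f);
      hi.setConstant(1.0f);
      Vec y = Eigen::TensorCwiseTernaryOp<ClampOp<float>, const Vec, const Vec, const Vec>(
          x, lo, hi, ClampOp<float>());  // 0, 0.5, 1
      return 0;
    }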
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index ece2ed91b..08eb5595a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -329,7 +329,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
 
     for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
-        a[i] = data[i] * std::conj(pos_j_base_powered[i]);
+        a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
       }
       else {
         a[i] = data[i] * pos_j_base_powered[i];
@@ -344,7 +344,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
         b[i] = pos_j_base_powered[i];
       }
       else {
-        b[i] = std::conj(pos_j_base_powered[i]);
+        b[i] = numext::conj(pos_j_base_powered[i]);
       }
     }
     for (Index i = n; i < m - n; ++i) {
@@ -355,7 +355,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
         b[i] = pos_j_base_powered[m-i];
       }
       else {
-        b[i] = std::conj(pos_j_base_powered[m-i]);
+        b[i] = numext::conj(pos_j_base_powered[m-i]);
       }
     }
 
@@ -379,7 +379,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
 
     for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
-        data[i] = a[i] * std::conj(pos_j_base_powered[i]);
+        data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
       }
       else {
         data[i] = a[i] * pos_j_base_powered[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 5d0548b84..c23ecdbc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
-    const Index numValues = m_impl.dimensions().TotalSize();
+    const Index numValues = internal::array_prod(m_impl.dimensions());
     m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
 
     // Should initialize the memory in case we're dealing with non POD types.
     if (NumTraits<CoeffReturnType>::RequireInitialization) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index a1a18d938..490ddd8bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -16,11 +16,12 @@ template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
 template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
 template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
 template<typename PlainObjectType> class TensorRef;
-template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
+template<typename Derived, int AccessLevel> class TensorBase;
 
 template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
 template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
 template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
 template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
 template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
 template<typename XprType> class TensorIndexTupleOp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3dd32e9d1..a8e48fced 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
 };
 
 
+template<typename Reducer, typename Device>
+struct reducer_traits {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
@@ -119,6 +127,15 @@ template <typename T> struct SumReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
+
 template <typename T> struct MeanReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
     DenseIndex packetCount_;
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
+
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMax
+  };
+};
+
+
 template <typename T> struct MinReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMin;
@@ -228,6 +263,14 @@ template <typename T> struct MinReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMin
+  };
+};
+
 
 template <typename T> struct ProdReducer
 {
@@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::MulCost,
+    PacketAccess = PacketType<T, Device>::HasMul
+  };
+};
+
 
 struct AndReducer
 {
@@ -280,6 +331,15 @@ struct AndReducer
   }
 };
 
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
 struct OrReducer {
   static const bool PacketAccess = false;
   static const bool IsStateful = false;
@@ -295,6 +355,15 @@ struct OrReducer {
   }
 };
 
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
 // Argmin/Argmax reducers
 template <typename T> struct ArgMaxTupleReducer
 {
@@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
+
 template <typename T> struct ArgMinTupleReducer
 {
   static const bool PacketAccess = false;
@@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
 
 // Random number generation
 namespace {
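The new reducer_traits template lets device-aware code query a reducer's cost and vectorizability per reducer/device pair, instead of reading the reducer's static PacketAccess member. A small sketch (illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      typedef Eigen::internal::reducer_traits<
          Eigen::internal::SumReducer<float>, Eigen::DefaultDevice> SumTraits;
      std::cout << "cost=" << int(SumTraits::Cost)
                << " packet=" << bool(SumTraits::PacketAccess) << std::endl;
      return 0;
    }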
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 000000000..665b861cf
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors.
+  *
+  * This function computes the regularized incomplete beta function (integral).
+  *
+  */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+                         const ADerived, const BDerived, const XDerived>
+    betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+  return TensorCwiseTernaryOp<
+      internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+      const BDerived, const XDerived>(
+      a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
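Usage of the new global entry point added above; betainc(a, b, x) evaluates the regularized incomplete beta function element-wise:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 1> a(3), b(3), x(3);
      a.setConstant(2.0f);
      b.setConstant(3.0f);
      x.setValues({0.1f, 0.5f, 0.9f});
      Eigen::Tensor<float, 1> r = Eigen::betainc(a, b, x);  // I_x(a, b) per element
      std::cout << r << std::endl;
      return 0;
    }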
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index 38a833f82..a901c5dd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -13,38 +13,61 @@
 namespace Eigen {
 
 namespace internal {
-template<>
-struct significant_decimals_impl<std::string>
-    : significant_decimals_default_impl<std::string, true>
-{};
-}
+
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+      static const int layout = Tensor::Layout;
+      Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+      os << matrix;
+    }
+  }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+      os << array;
+    }
+  }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    os << tensor.coeff(0);
+  }
+};
+}
 
 template <typename T>
 std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+  typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+  typedef typename Evaluator::Dimensions Dimensions;
+
   // Evaluate the expression if needed
   TensorForcedEvalOp<const T> eval = expr.eval();
-  TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+  Evaluator tensor(eval, DefaultDevice());
   tensor.evalSubExprsIfNeeded(NULL);
 
-  typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
-  typedef typename T::Index Index;
-  typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
-  const Index total_size = internal::array_prod(tensor.dimensions());
-
-  // Print the tensor as a 1d vector or a 2d matrix.
+  // Print the result
   static const int rank = internal::array_size<Dimensions>::value;
-  if (rank == 0) {
-    os << tensor.coeff(0);
-  } else if (rank == 1) {
-    Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
-    os << array;
-  } else {
-    const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
-    static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
-    Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
-    os << matrix;
-  }
+  internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
 
   // Cleanup.
   tensor.cleanup();
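The printing logic above is now dispatched on rank at compile time. A short sketch of what each rank prints (illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 3> t(2, 2, 2);
      t.setConstant(1.0f);
      std::cout << t << std::endl;  // rank >= 2: 2-d matrix, first_dim x remainder

      Eigen::Tensor<float, 0> s = t.sum();
      std::cout << s << std::endl;  // rank 0: bare scalar, prints 8
      return 0;
    }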
- __builtin_clzll(static_cast<uint64_t>(val)) : - __builtin_clz(static_cast<uint32_t>(val)); + return __builtin_clzll(static_cast<uint64_t>(val)); #endif } @@ -98,7 +120,9 @@ namespace { return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; - TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1)); + TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) + - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + + TensorUInt128<static_val<0>, static_val<1> >(1); return static_cast<uint64_t>(result); #endif } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b1645d56f..fdb5ee6b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -47,22 +47,39 @@ template <> struct max_n_1<0> { // Default packet types template <typename Scalar, typename Device> -struct PacketType { +struct PacketType : internal::packet_traits<Scalar> { typedef typename internal::packet_traits<Scalar>::type type; - enum { size = internal::unpacket_traits<type>::size }; }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) template <> -struct PacketType<float, GpuDevice> { - typedef float4 type; - static const int size = 4; -}; -template <> -struct PacketType<double, GpuDevice> { - typedef double2 type; +struct PacketType<half, GpuDevice> { + typedef half2 type; static const int size = 2; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 52cfc2824..d34f1e328 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } protected: TensorEvaluator<ArgType, Device> m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 99a09c058..9df697e4c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -264,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); - Barrier barrier(numblocks); + Barrier barrier(internal::convert_index<unsigned int>(numblocks)); MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, @@ -316,7 +316,7 @@ struct OuterReducer { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template <int B, int N, typename S, typename R, typename I> -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 @@ -492,7 +492,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> } // Attempt to use an optimized reduction. - else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -505,8 +505,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> (reducing_inner_dims || ReducingInnerMostDims)) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { + data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } Op reducer(m_reducer); - return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } } bool preserving_inner_dims = true; @@ -521,8 +533,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { + data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } Op reducer(m_reducer); - return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } } } return true; @@ -537,8 +561,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (RunningFullReduction && m_result) { - return *m_result; + if ((RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); } Op reducer(m_reducer); if (ReducingInnerMostDims || RunningFullReduction) { @@ 
-558,7 +582,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload<PacketReturnType>(m_result + index); + } EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; if (ReducingInnerMostDims) { @@ -616,7 +644,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> template <typename S, typename O, bool V> friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 45087a9a4..65638b6a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -67,11 +67,21 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) #endif } +// We extend atomicExch to support extra data types +template <typename Type> +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} #ifdef EIGEN_HAS_CUDA_FP16 template <template <typename T> class R> __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) { -#if __CUDA_ARCH__ >= 300 unsigned int oldval = *reinterpret_cast<unsigned int*>(output); unsigned int newval = oldval; reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); @@ -87,9 +97,6 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer return; } } -#else - assert(0 && "Shouldn't be called on unsupported device"); -#endif } #endif @@ -112,18 +119,44 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe } } + template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, - typename Self::CoeffReturnType* output) { + typename Self::CoeffReturnType* output, unsigned int* semaphore) { +#if __CUDA_ARCH__ >= 300 + // Initialize the output value const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - - // Initialize the output value if it wasn't initialized by the ReductionInitKernel - if (gridDim.x == 1 && first_index == 0) { - *output = 
reducer.initialize(); - __syncthreads(); + if (gridDim.x == 1) { + if (first_index == 0) { + *output = reducer.initialize(); + } + } + else { + if (threadIdx.x == 0) { + unsigned int block = atomicCAS(semaphore, 0u, 1u); + if (block == 0) { + // We're the first block to run, initialize the output value + atomicExchCustom(output, reducer.initialize()); + __threadfence(); + atomicExch(semaphore, 2u); + } + else { + // Wait for the first block to initialize the output value. + // Use atomicCAS here to ensure that the reads aren't cached + unsigned int val; + do { + val = atomicCAS(semaphore, 2u, 2u); + } + while (val < 2u); + } + } } + __syncthreads(); + + eigen_assert(gridDim.x == 1 || *semaphore >= 2u); + typename Self::CoeffReturnType accum = reducer.initialize(); Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize); for (Index i = 0; i < max_iter; i+=BlockSize) { @@ -141,6 +174,14 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num if ((threadIdx.x & (warpSize - 1)) == 0) { atomicReduce(output, accum, reducer); } + + if (gridDim.x > 1 && threadIdx.x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif } @@ -229,32 +270,35 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2 #endif - -template <typename Self, typename Op, typename OutputType, bool PacketAccess> +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> struct FullReductionLauncher { static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { - assert(false && "Should only be called on floats and half floats"); + assert(false && "Should only be called on doubles, floats and half floats"); } }; -template <typename Self, typename Op, bool PacketAccess> -struct FullReductionLauncher<Self, Op, float, PacketAccess> { - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) { +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; typedef typename Self::CoeffReturnType Scalar; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + unsigned int* semaphore = NULL; if (num_blocks > 1) { - // We initialize the outputs outside the reduction kernel when we can't be sure that there - // won't be a race conditions between multiple thread blocks. 
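The initialization handshake added to FullReductionKernel above distills to the following sketch, assuming *semaphore starts at 0 (device.semaphore() is expected to hand out a zeroed word) and gridDim.x > 1; the real kernel additionally goes through atomicExchCustom so the publishing store also works for double:

    __device__ void init_output_once(float* output, float init_val,
                                     unsigned int* semaphore) {
      if (threadIdx.x == 0) {
        if (atomicCAS(semaphore, 0u, 1u) == 0u) {
          // First block to arrive: initialize, then publish.
          *output = init_val;
          __threadfence();            // make the store visible device-wide
          atomicExch(semaphore, 2u);  // 2 means "output initialized"
        } else {
          // Losing blocks spin; atomicCAS is used so the read is never cached.
          while (atomicCAS(semaphore, 2u, 2u) < 2u) {}
        }
      }
      __syncthreads();
    }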
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>), - 1, 32, 0, device, reducer.initialize(), 1, output); + semaphore = device.semaphore(); } LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); } }; @@ -298,27 +342,29 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> { template <typename Self, typename Op, bool Vectorizable> struct FullReducer<Self, Op, GpuDevice, Vectorizable> { // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats and half floats. - #ifdef EIGEN_HAS_CUDA_FP16 + // so reduce the scope of the optimized version of the code to the simple cases + // of doubles, floats and half floats +#ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); #endif template <typename OutputType> static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - assert(HasOptimizedImplementation && "Should only be called on floats or half floats"); + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); const Index num_coeffs = array_prod(self.m_impl.dimensions()); // Don't crash when we're called with an input tensor of size 0. 
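The launcher specializations above use enable_if to share one optimized implementation between float and double. Stripped to its essentials (with std::enable_if standing in for internal::enable_if), the dispatch pattern is:

    #include <type_traits>

    template <typename T, typename Enabled = void>
    struct Launcher {  // generic fallback, mirrors the assert-only default
      static bool run() { return false; }
    };

    // Selected whenever T is float or double; one body serves both types.
    template <typename T>
    struct Launcher<T, typename std::enable_if<
                           std::is_same<T, float>::value ||
                           std::is_same<T, double>::value>::type> {
      static bool run() { return true; }
    };

    // Launcher<float>::run() and Launcher<double>::run() pick the
    // specialization; Launcher<int>::run() falls back to the default.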
if (num_coeffs == 0) { return; } - FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs); + FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs); } }; @@ -327,6 +373,8 @@ template <int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { +#if __CUDA_ARCH__ >= 300 + typedef typename Self::CoeffReturnType Type; eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); eigen_assert(gridDim.y == 1); @@ -356,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu const Index col_block = i % input_col_blocks; const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; - float reduced_val = reducer.initialize(); + Type reduced_val = reducer.initialize(); for (Index j = 0; j < NumPerThread; j += unroll_times) { const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); if (last_col >= num_coeffs_to_reduce) { for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { - const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); reducer.reduce(val, &reduced_val); } break; @@ -386,6 +434,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif } #ifdef EIGEN_HAS_CUDA_FP16 @@ -485,17 +536,23 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, #endif -template <typename Self, typename Op, typename OutputType, bool PacketAccess> +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> struct InnerReductionLauncher { static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { - assert(false && "Should only be called to reduce floats and half floats on a gpu device"); + assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); return true; } }; -template <typename Self, typename Op, bool PacketAccess> -struct InnerReductionLauncher<Self, Op, float, PacketAccess> { - static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; @@ -513,7 +570,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> { const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / 1024; const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - 
LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), + LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks, 1024, 0, device, reducer.initialize(), num_preserved_vals, output); } @@ -580,15 +637,17 @@ struct InnerReducer<Self, Op, GpuDevice> { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); #endif template <typename OutputType> static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { - assert(HasOptimizedImplementation && "Should only be called on floats or half floats"); + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); const Index num_coeffs = array_prod(self.m_impl.dimensions()); // Don't crash when we're called with an input tensor of size 0. if (num_coeffs == 0) { @@ -599,7 +658,7 @@ struct InnerReducer<Self, Op, GpuDevice> { return true; } - return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); } }; @@ -639,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> { // so reduce the scope of the optimized version of the code to the simple case // of floats. static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; - + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); template <typename Device, typename OutputType> static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { - assert(false && "Should only be called to reduce floats on a gpu device"); + assert(false && "Should only be called to reduce doubles or floats on a gpu device"); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 031dbf6f2..8501466ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -9,9 +9,11 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H #define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H + namespace Eigen { namespace internal { + template <typename Op, typename XprType> struct traits<TensorScanOp<Op, XprType> > : public traits<XprType> { @@ -42,9 +44,7 @@ struct nested<TensorScanOp<Op, XprType>, 1, * \ingroup CXX11_Tensor_Module * * \brief Tensor scan class. 
- * */ - template <typename Op, typename XprType> class TensorScanOp : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> { @@ -57,8 +57,8 @@ public: typedef typename Eigen::internal::traits<TensorScanOp>::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp( - const XprType& expr, const Index& axis, const Op& op = Op()) - : m_expr(expr), m_axis(axis), m_accumulator(op) {} + const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op()) + : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; } @@ -66,13 +66,19 @@ public: const XprType& expression() const { return m_expr; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool exclusive() const { return m_exclusive; } protected: typename XprType::Nested m_expr; const Index m_axis; const Op m_accumulator; + const bool m_exclusive; }; +template <typename Self, typename Reducer, typename Device> +struct ScanLauncher; + // Eval as rvalue template <typename Op, typename ArgType, typename Device> struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { @@ -81,13 +87,14 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { typedef typename XprType::Index Index; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self; enum { IsAligned = false, - PacketAccess = (internal::packet_traits<Scalar>::size > 1), + PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, @@ -98,45 +105,71 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { const Device& device) : m_impl(op.expression(), device), m_device(device), - m_axis(op.axis()), + m_exclusive(op.exclusive()), m_accumulator(op.accumulator()), - m_dimensions(m_impl.dimensions()), - m_size(m_dimensions[m_axis]), + m_size(m_impl.dimensions()[op.axis()]), m_stride(1), m_output(NULL) { // Accumulating a scalar isn't supported. 
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(m_axis >= 0 && m_axis < NumDims); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(op.axis() >= 0 && op.axis() < NumDims); // Compute stride of scan axis + const Dimensions& dims = m_impl.dimensions(); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < m_axis; ++i) { - m_stride = m_stride * m_dimensions[i]; + for (int i = 0; i < op.axis(); ++i) { + m_stride = m_stride * dims[i]; } } else { - for (int i = NumDims - 1; i > m_axis; --i) { - m_stride = m_stride * m_dimensions[i]; + for (int i = NumDims - 1; i > op.axis(); --i) { + m_stride = m_stride * dims[i]; } } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_dimensions; + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { + return m_stride; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { + return m_size; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { + return m_accumulator; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { + return m_exclusive; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const { + return m_impl; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { + return m_device; + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_impl.evalSubExprsIfNeeded(NULL); + ScanLauncher<Self, Op, Device> launcher; if (data) { - accumulateTo(data); + launcher(*this, data); return false; - } else { - m_output = static_cast<CoeffReturnType*>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - accumulateTo(m_output); - return true; } + + const Index total_size = internal::array_prod(dimensions()); + m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar))); + launcher(*this, m_output); + return true; } - + template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { return internal::ploadt<PacketReturnType, LoadMode>(m_output + index); @@ -152,6 +185,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { return m_output[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_output != NULL) { m_device.deallocate(m_output); @@ -163,35 +200,88 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { protected: TensorEvaluator<ArgType, Device> m_impl; const Device& m_device; - const Index m_axis; + const bool m_exclusive; Op m_accumulator; - const Dimensions& m_dimensions; - const Index& m_size; + const Index m_size; Index m_stride; CoeffReturnType* m_output; +}; + +// CPU implementation of scan +// TODO(ibab) This single-threaded implementation should be parallelized, +// at least by running multiple scans at the same time. 
+template <typename Self, typename Reducer, typename Device> +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType *data) { + Index total_size = internal::array_prod(self.dimensions()); - // TODO(ibab) Parallelize this single-threaded implementation if desired - EIGEN_DEVICE_FUNC void accumulateTo(Scalar* data) { - // We fix the index along the scan axis to 0 and perform an + // We fix the index along the scan axis to 0 and perform a // scan per remaining entry. The iteration is split into two nested // loops to avoid an integer division by keeping track of each idx1 and idx2. - for (Index idx1 = 0; idx1 < dimensions().TotalSize() / m_size; idx1 += m_stride) { - for (Index idx2 = 0; idx2 < m_stride; idx2++) { - // Calculate the starting offset for the scan - Index offset = idx1 * m_size + idx2; - - // Compute the prefix sum along the axis, starting at the calculated offset - CoeffReturnType accum = m_accumulator.initialize(); - for (Index idx3 = 0; idx3 < m_size; idx3++) { - Index curr = offset + idx3 * m_stride; - m_accumulator.reduce(m_impl.coeff(curr), &accum); - data[curr] = m_accumulator.finalize(accum); + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + for (Index idx2 = 0; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); } - } + } + } } } }; +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +// GPU implementation of scan +// TODO(ibab) This placeholder implementation performs multiple scans in +// parallel, but it would be better to use a parallel scan algorithm and +// optimize memory access. 
+template <typename Self, typename Reducer> +__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { + // Compute offset as in the CPU version + Index val = threadIdx.x + blockIdx.x * blockDim.x; + Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); + + if (offset + (self.size() - 1) * self.stride() < total_size) { + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx = 0; idx < self.size(); idx++) { + Index curr = offset + idx * self.stride(); + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + __syncthreads(); + +} + +template <typename Self, typename Reducer> +struct ScanLauncher<Self, Reducer, GpuDevice> { + void operator()(const Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + Index num_blocks = (total_size / self.size() + 63) / 64; + Index block_size = 64; + LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data); + } +}; +#endif // EIGEN_USE_GPU && __CUDACC__ + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index bdcd70fd9..3523e7c94 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -20,6 +20,7 @@ struct static_val { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { } + template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { eigen_assert(v == n); @@ -53,7 +54,7 @@ struct TensorUInt128 template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { - eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest()))); + eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest())); eigen_assert(x >= 0); } diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt deleted file mode 100644 index 6e871a8da..000000000 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_TensorSymmetry_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel - ) - -add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt deleted file mode 100644 index dc9fc78ec..000000000 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_TensorSymmetry_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util 
COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt deleted file mode 100644 index 88fef50c6..000000000 --- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_ThreadPool_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h index 6dd64f185..12b80d6c4 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -169,7 +169,8 @@ class EventCount { class Waiter { friend class EventCount; - std::atomic<Waiter*> next; + // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector. + EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next; std::mutex mu; std::condition_variable cv; uint64_t epoch; @@ -179,8 +180,6 @@ class EventCount { kWaiting, kSignaled, }; - // Prevent false sharing with other Waiter objects in the same vector. - char pad_[128]; }; private: diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index c094563b7..33ae45131 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -74,7 +74,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { PerThread* pt = GetPerThread(); if (pt->pool == this) { // Worker thread of this pool, push onto the thread's queue. - Queue* q = queues_[pt->index]; + Queue* q = queues_[pt->thread_id]; t = q->PushFront(std::move(t)); } else { // A free-standing thread (or worker of another pool), push onto a random @@ -95,14 +95,28 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { env_.ExecuteTask(t); // Push failed, execute directly. } + int NumThreads() const final { + return static_cast<int>(threads_.size()); + } + + int CurrentThreadId() const final { + const PerThread* pt = + const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + private: typedef typename Environment::EnvThread Thread; struct PerThread { - bool inited; + constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { } NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads. - unsigned index; // Worker thread index in pool. - unsigned rand; // Random generator state. + uint64_t rand; // Random generator state. + int thread_id; // Worker thread index in pool. }; Environment env_; @@ -116,12 +130,13 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { EventCount ec_; // Main worker thread loop. 
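The Waiter change in EventCount.h above trades explicit tail padding for alignment on the first member; a minimal sketch of the two layouts (member names are illustrative, 128 bytes matches the patch):

    #include <atomic>

    struct PaddedWaiter {            // old approach: explicit tail padding
      std::atomic<int> state;
      char pad_[128];
    };
    struct AlignedWaiter {           // new approach: over-align a member
      alignas(128) std::atomic<int> state;
    };
    // alignas(128) raises the struct's alignment, so sizeof(AlignedWaiter)
    // becomes a multiple of 128 and neighbouring Waiters in a vector never
    // share a cache line.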
- void WorkerLoop(unsigned index) { + void WorkerLoop(int thread_id) { PerThread* pt = GetPerThread(); pt->pool = this; - pt->index = index; - Queue* q = queues_[index]; - EventCount::Waiter* waiter = &waiters_[index]; + pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id()); + pt->thread_id = thread_id; + Queue* q = queues_[thread_id]; + EventCount::Waiter* waiter = &waiters_[thread_id]; for (;;) { Task t = q->PopFront(); if (!t.f) { @@ -235,17 +250,18 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { return -1; } - PerThread* GetPerThread() { + static EIGEN_STRONG_INLINE PerThread* GetPerThread() { EIGEN_THREAD_LOCAL PerThread per_thread_; PerThread* pt = &per_thread_; - if (pt->inited) return pt; - pt->inited = true; - pt->rand = static_cast<unsigned>(std::hash<std::thread::id>()(std::this_thread::get_id())); return pt; } - static unsigned Rand(unsigned* state) { - return *state = *state * 1103515245 + 12345; + static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) { + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); } }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h index 17fd1658b..e75d0f467 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h @@ -24,7 +24,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment()) : env_(env), threads_(num_threads), waiters_(num_threads) { for (int i = 0; i < num_threads; i++) { - threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); + threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); })); } } @@ -55,7 +55,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { // Schedule fn() for execution in the pool of threads. The functions are // executed in the order in which they are scheduled. - void Schedule(std::function<void()> fn) { + void Schedule(std::function<void()> fn) final { Task t = env_.CreateTask(std::move(fn)); std::unique_lock<std::mutex> l(mu_); if (waiters_.empty()) { @@ -69,9 +69,25 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { } } + int NumThreads() const final { + return static_cast<int>(threads_.size()); + } + + int CurrentThreadId() const final { + const PerThread* pt = this->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + protected: - void WorkerLoop() { + void WorkerLoop(int thread_id) { std::unique_lock<std::mutex> l(mu_); + PerThread* pt = GetPerThread(); + pt->pool = this; + pt->thread_id = thread_id; Waiter w; Task t; while (!exiting_) { @@ -111,13 +127,24 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { bool ready; }; + struct PerThread { + constexpr PerThread() : pool(NULL), thread_id(-1) { } + SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads. + int thread_id; // Worker thread index in pool. + }; + Environment env_; std::mutex mu_; MaxSizeVector<Thread*> threads_; // All threads MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads. 
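The new Rand above is a PCG-XSH-RS generator: a 64-bit LCG state update followed by an xorshift output permutation with a data-dependent shift amount. Copied out as a standalone function for illustration only:

    #include <cstdint>

    inline unsigned pcg_xsh_rs(uint64_t* state) {
      const uint64_t current = *state;
      // LCG state update.
      *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
      // XSH-RS output: xorshift, then a random shift of 22..29 bits.
      return static_cast<unsigned>(
          (current ^ (current >> 22)) >> (22 + (current >> 61)));
    }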
- std::deque<Task> pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() + std::deque<Task> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() bool exiting_ = false; + + PerThread* GetPerThread() const { + EIGEN_THREAD_LOCAL PerThread per_thread; + return &per_thread; + } }; typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h index d2204ad5b..399f95cc1 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h @@ -21,14 +21,14 @@ struct StlThreadEnvironment { // destructor must join the thread. class EnvThread { public: - EnvThread(std::function<void()> f) : thr_(f) {} + EnvThread(std::function<void()> f) : thr_(std::move(f)) {} ~EnvThread() { thr_.join(); } private: std::thread thr_; }; - EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); } + EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); } Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } void ExecuteTask(const Task& t) { t.f(); } }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h index 38b40aceb..a65ee97c9 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h @@ -18,6 +18,13 @@ class ThreadPoolInterface { public: virtual void Schedule(std::function<void()> fn) = 0; + // Returns the number of threads in the pool. + virtual int NumThreads() const = 0; + + // Returns a logical thread index between 0 and NumThreads() - 1 if called + // from one of the threads in the pool. Returns -1 otherwise. 
+ virtual int CurrentThreadId() const = 0; + virtual ~ThreadPoolInterface() {} }; diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt deleted file mode 100644 index 7eab492d6..000000000 --- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel - ) diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct index c932c06a6..5f5afb8cf 100644 --- a/unsupported/Eigen/KroneckerProduct +++ b/unsupported/Eigen/KroneckerProduct @@ -13,6 +13,8 @@ #include "../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include "../../Eigen/src/SparseCore/SparseUtil.h" + namespace Eigen { /** diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport index 89036886b..7f0b70c63 100644 --- a/unsupported/Eigen/MPRealSupport +++ b/unsupported/Eigen/MPRealSupport @@ -67,27 +67,32 @@ int main() IsSigned = 1, IsComplex = 0, RequireInitialization = 1, - ReadCost = 10, - AddCost = 10, - MulCost = 40 + ReadCost = HugeCost, + AddCost = HugeCost, + MulCost = HugeCost }; typedef mpfr::mpreal Real; typedef mpfr::mpreal NonInteger; - inline static Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); } - inline static Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); } + static inline Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); } + static inline Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); } // Constants - inline static Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); } - inline static Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); } - inline static Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); } - inline static Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); } + static inline Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); } + static inline Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); } + static inline Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); } + static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); } - inline static Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); } - inline static Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); } + static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); } + static inline Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); } - inline static Real dummy_precision() +#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS + static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec()) { return std::numeric_limits<Real>::digits10(Precision); } + static inline int digits10 (const Real& x) { return std::numeric_limits<Real>::digits10(x); } +#endif + + static inline Real dummy_precision() { mpfr_prec_t 
weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100; return mpfr::machine_epsilon(weak_prec); diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions new file mode 100644 index 000000000..7c7493c56 --- /dev/null +++ b/unsupported/Eigen/SpecialFunctions @@ -0,0 +1,61 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIALFUNCTIONS_MODULE +#define EIGEN_SPECIALFUNCTIONS_MODULE + +#include "../../Eigen/Core" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +namespace Eigen { + +/** + * \defgroup SpecialFunctions_Module Special math functions module + * + * This module features additional coefficient-wise math functions available + * within the numext:: namespace for the scalar version, and as method and/or free + * functions of Array. Those include: + * + * - erf + * - erfc + * - lgamma + * - igamma + * - igammac + * - digamma + * - polygamma + * - zeta + * - betainc + * + * \code + * #include <unsupported/Eigen/SpecialFunctions> + * \endcode + */ +//@{ + +} + +#include "src/SpecialFunctions/SpecialFunctionsImpl.h" +#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h" +#include "src/SpecialFunctions/SpecialFunctionsHalf.h" +#include "src/SpecialFunctions/SpecialFunctionsFunctors.h" +#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h" + +#if defined EIGEN_VECTORIZE_CUDA + #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h" +#endif + +namespace Eigen { +//@} +} + + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_SPECIALFUNCTIONS_MODULE diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 089042751..50fedf6ac 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -30,6 +30,13 @@ template<typename _DerType, bool Enable> struct auto_diff_special_op; } // end namespace internal +template<typename _DerType> class AutoDiffScalar; + +template<typename NewDerType> +inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) { + return AutoDiffScalar<NewDerType>(value,der); +} + /** \class AutoDiffScalar * \brief A scalar type replacement with automatic differentation capability * @@ -60,7 +67,7 @@ template<typename _DerType> class AutoDiffScalar : public internal::auto_diff_special_op <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar, - typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> + typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> { public: typedef internal::auto_diff_special_op @@ -257,20 +264,16 @@ class AutoDiffScalar -m_derivatives); } - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator*(const Scalar& other) const { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - m_value * other, - 
(m_derivatives * other)); + return MakeAutoDiffScalar(m_value * other, m_derivatives * other); } - friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator*(const Scalar& other, const AutoDiffScalar& a) { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - a.value() * other, - a.derivatives() * other); + return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other); } // inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type > @@ -289,20 +292,16 @@ class AutoDiffScalar // a.derivatives() * other); // } - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator/(const Scalar& other) const { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - m_value / other, - (m_derivatives * (Scalar(1)/other))); + return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other))); } - friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator/(const Scalar& other, const AutoDiffScalar& a) { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - other / a.value(), - a.derivatives() * (Scalar(-other) / (a.value()*a.value()))); + return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value()))); } // inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type > @@ -322,34 +321,29 @@ class AutoDiffScalar // } template<typename OtherDerType> - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, - const CwiseBinaryOp<internal::scalar_difference_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE( + CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) > operator/(const AutoDiffScalar<OtherDerType>& other) const { internal::make_coherent(m_derivatives, other.derivatives()); - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, - const CwiseBinaryOp<internal::scalar_difference_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >( + return MakeAutoDiffScalar( m_value / other.value(), - ((m_derivatives * other.value()) - (m_value * other.derivatives())) + ((m_derivatives * other.value()) - (other.derivatives() * m_value)) * (Scalar(1)/(other.value()*other.value()))); } template<typename OtherDerType> inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const 
CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > > + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product), + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > > operator*(const AutoDiffScalar<OtherDerType>& other) const { internal::make_coherent(m_derivatives, other.derivatives()); - return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >( + return MakeAutoDiffScalar( m_value * other.value(), - (m_derivatives * other.value()) + (m_value * other.derivatives())); + (m_derivatives * other.value()) + (other.derivatives() * m_value)); } inline AutoDiffScalar& operator*=(const Scalar& other) @@ -426,18 +420,18 @@ struct auto_diff_special_op<_DerType, true> } - inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type > + inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type > operator*(const Real& other) const { - return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >( + return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >( derived().value() * other, derived().derivatives() * other); } - friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type > + friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type > operator*(const Real& other, const AutoDiffScalar<_DerType>& a) { - return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >( + return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >( a.value() * other, a.derivatives() * other); } @@ -501,43 +495,44 @@ struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, } }; -template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols> -struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar> -{ - enum { Defined = 1 }; - typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType; -}; - -template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols> -struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> > -{ - enum { Defined = 1 }; - typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType; -}; +} // end namespace internal -template<typename DerType> -struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar> +template<typename DerType, typename BinOp> +struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp> { - enum { Defined = 1 }; typedef AutoDiffScalar<DerType> ReturnType; }; -template<typename DerType> -struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> > +template<typename DerType, typename BinOp> +struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp> { - enum { Defined = 1 }; typedef AutoDiffScalar<DerType> ReturnType; }; -} // end namespace internal + +// The following is an 
attempt to let Eigen know about expression templates, but that's more tricky! + +// template<typename DerType, typename BinOp> +// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp> +// { +// enum { Defined = 1 }; +// typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType; +// }; +// +// template<typename DerType1,typename DerType2, typename BinOp> +// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp> +// { +// enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value }; +// typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType; +// }; #define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \ template<typename DerType> \ - inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \ + inline const Eigen::AutoDiffScalar< \ + EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \ FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \ using namespace Eigen; \ - typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \ - typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \ + EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \ CODE; \ } @@ -567,49 +562,56 @@ inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::Plain typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS; return (x > y ? ADS(x) : ADS(y)); } +template<typename DerType> +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) { + return (x.value() < y.value() ? x : y); +} +template<typename DerType> +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) { + return (x.value() >= y.value() ? x : y); +} + EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs, using std::abs; - return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ?
-1 : 1) );) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2, using numext::abs2; - return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));) + return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt, using std::sqrt; Scalar sqrtx = sqrt(x.value()); - return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));) + return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos, using std::cos; using std::sin; - return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));) + return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin, using std::sin; using std::cos; - return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));) + return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp, using std::exp; Scalar expx = exp(x.value()); - return ReturnType(expx,x.derivatives() * expx);) + return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log, using std::log; - return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));) + return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));) template<typename DerType> -inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar>, const typename internal::remove_all<DerType>::type> > -pow(const Eigen::AutoDiffScalar<DerType>& x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y) +inline const Eigen::AutoDiffScalar< +EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) > +pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y) { using namespace Eigen; - typedef typename internal::remove_all<DerType>::type DerTypeCleaned; - typedef typename Eigen::internal::traits<DerTypeCleaned>::Scalar Scalar; - return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerTypeCleaned> >( - std::pow(x.value(),y), - x.derivatives() * (y * std::pow(x.value(),y-1))); + using std::pow; + return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1))); } @@ -634,27 +636,44 @@ atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan, using std::tan; using std::cos; - return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));) + return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin, using std::sqrt; using std::asin; - return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));) + return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos, using std::sqrt; using std::acos; - return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));) + return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * 
(Scalar(-1)/sqrt(1-numext::abs2(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh, + using std::cosh; + using std::tanh; + return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh, + using std::sinh; + using std::cosh; + return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh, + using std::sinh; + using std::cosh; + return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));) #undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> > - : NumTraits< typename NumTraits<typename DerType::Scalar>::Real > + : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real > { - typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime, - DerType::Options, DerType::MaxRowsAtCompileTime, DerType::MaxColsAtCompileTime> > Real; + typedef typename internal::remove_all<DerType>::type DerTypeCleaned; + typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime, + 0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real; typedef AutoDiffScalar<DerType> NonInteger; typedef AutoDiffScalar<DerType> Nested; + typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal; enum{ RequireInitialization = 1 }; diff --git a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt b/unsupported/Eigen/src/AutoDiff/CMakeLists.txt deleted file mode 100644 index ad91fd9c4..000000000 --- a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_AutoDiff_SRCS "*.h") - -INSTALL(FILES - ${Eigen_AutoDiff_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/AutoDiff COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/BVH/CMakeLists.txt b/unsupported/Eigen/src/BVH/CMakeLists.txt deleted file mode 100644 index b377d865c..000000000 --- a/unsupported/Eigen/src/BVH/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_BVH_SRCS "*.h") - -INSTALL(FILES - ${Eigen_BVH_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/BVH COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt deleted file mode 100644 index 754953335..000000000 --- a/unsupported/Eigen/src/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -ADD_SUBDIRECTORY(AutoDiff) -ADD_SUBDIRECTORY(BVH) -ADD_SUBDIRECTORY(Eigenvalues) -ADD_SUBDIRECTORY(FFT) -ADD_SUBDIRECTORY(IterativeSolvers) -ADD_SUBDIRECTORY(LevenbergMarquardt) -ADD_SUBDIRECTORY(MatrixFunctions) -ADD_SUBDIRECTORY(MoreVectorization) -ADD_SUBDIRECTORY(NonLinearOptimization) -ADD_SUBDIRECTORY(NumericalDiff) -ADD_SUBDIRECTORY(Polynomials) -ADD_SUBDIRECTORY(Skyline) -ADD_SUBDIRECTORY(SparseExtra) -ADD_SUBDIRECTORY(KroneckerProduct) -ADD_SUBDIRECTORY(Splines) -ADD_SUBDIRECTORY(EulerAngles) diff --git a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt deleted file mode 100644 index 1d4387c82..000000000 --- a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Eigenvalues_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Eigenvalues_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Eigenvalues COMPONENT Devel - ) diff 
--git a/unsupported/Eigen/src/FFT/CMakeLists.txt b/unsupported/Eigen/src/FFT/CMakeLists.txt deleted file mode 100644 index edcffcb18..000000000 --- a/unsupported/Eigen/src/FFT/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_FFT_SRCS "*.h") - -INSTALL(FILES - ${Eigen_FFT_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/FFT COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt b/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt deleted file mode 100644 index 7986afc5e..000000000 --- a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_IterativeSolvers_SRCS "*.h") - -INSTALL(FILES - ${Eigen_IterativeSolvers_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/IterativeSolvers COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt b/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt deleted file mode 100644 index 4daefebee..000000000 --- a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_KroneckerProduct_SRCS "*.h") - -INSTALL(FILES - ${Eigen_KroneckerProduct_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/KroneckerProduct COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h index bf9727c21..582fa8512 100644 --- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h +++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h @@ -203,7 +203,7 @@ struct traits<KroneckerProduct<_Lhs,_Rhs> > { typedef typename remove_all<_Lhs>::type Lhs; typedef typename remove_all<_Rhs>::type Rhs; - typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; + typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex; enum { @@ -222,7 +222,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> > typedef MatrixXpr XprKind; typedef typename remove_all<_Lhs>::type Lhs; typedef typename remove_all<_Rhs>::type Rhs; - typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; + typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind; typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex; diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt deleted file mode 100644 index d9690854d..000000000 --- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h") - -INSTALL(FILES - ${Eigen_LevenbergMarquardt_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt b/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt deleted file mode 100644 index cdde64d2c..000000000 --- a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB 
Eigen_MatrixFunctions_SRCS "*.h") - -INSTALL(FILES - ${Eigen_MatrixFunctions_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MatrixFunctions COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h index 9f08c6162..afd88ec4d 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h @@ -65,21 +65,6 @@ void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, ty sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs); } -// similar to compute1x1offDiagonalBlock() -template <typename MatrixType, typename ResultType> -void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) -{ - typedef typename traits<MatrixType>::Scalar Scalar; - Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i); - Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j); - Matrix<Scalar,2,2> C = T.template block<2,2>(i,j); - if (j-i > 2) - C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2); - Matrix<Scalar,2,2> X; - matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C); - sqrtT.template block<2,2>(i,j) = X; -} - // solves the equation A X + X B = C where all matrices are 2-by-2 template <typename MatrixType> void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C) @@ -98,13 +83,13 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const coeffMatrix.coeffRef(2,3) = B.coeff(1,0); coeffMatrix.coeffRef(3,1) = A.coeff(1,0); coeffMatrix.coeffRef(3,2) = B.coeff(0,1); - + Matrix<Scalar,4,1> rhs; rhs.coeffRef(0) = C.coeff(0,0); rhs.coeffRef(1) = C.coeff(0,1); rhs.coeffRef(2) = C.coeff(1,0); rhs.coeffRef(3) = C.coeff(1,1); - + Matrix<Scalar,4,1> result; result = coeffMatrix.fullPivLu().solve(rhs); @@ -114,6 +99,20 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const X.coeffRef(1,1) = result.coeff(3); } +// similar to compute1x1offDiagonalBlock() +template <typename MatrixType, typename ResultType> +void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +{ + typedef typename traits<MatrixType>::Scalar Scalar; + Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i); + Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j); + Matrix<Scalar,2,2> C = T.template block<2,2>(i,j); + if (j-i > 2) + C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2); + Matrix<Scalar,2,2> X; + matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C); + sqrtT.template block<2,2>(i,j) = X; +} // pre: T is quasi-upper-triangular and sqrtT is a zero matrix of the same size // post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T diff --git a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt b/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt deleted file mode 100644 index 1b887cc8e..000000000 --- a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_MoreVectorization_SRCS "*.h") - -INSTALL(FILES - ${Eigen_MoreVectorization_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MoreVectorization COMPONENT Devel - ) diff --git 
a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt b/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt deleted file mode 100644 index 9322ddadf..000000000 --- a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_NonLinearOptimization_SRCS "*.h") - -INSTALL(FILES - ${Eigen_NonLinearOptimization_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NonLinearOptimization COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt b/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt deleted file mode 100644 index 1199aca2f..000000000 --- a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_NumericalDiff_SRCS "*.h") - -INSTALL(FILES - ${Eigen_NumericalDiff_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NumericalDiff COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Polynomials/CMakeLists.txt b/unsupported/Eigen/src/Polynomials/CMakeLists.txt deleted file mode 100644 index 51f13f3cb..000000000 --- a/unsupported/Eigen/src/Polynomials/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Polynomials_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Polynomials_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Polynomials COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Skyline/CMakeLists.txt b/unsupported/Eigen/src/Skyline/CMakeLists.txt deleted file mode 100644 index 3bf1b0dd4..000000000 --- a/unsupported/Eigen/src/Skyline/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Skyline_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Skyline_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Skyline COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt b/unsupported/Eigen/src/SparseExtra/CMakeLists.txt deleted file mode 100644 index 7ea32ca5e..000000000 --- a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_SparseExtra_SRCS "*.h") - -INSTALL(FILES - ${Eigen_SparseExtra_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SparseExtra COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h new file mode 100644 index 000000000..ed415db99 --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h @@ -0,0 +1,124 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H +#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) of the given arrays. + * + * This function computes the coefficient-wise incomplete gamma function. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of igamma(T,T) for any scalar + * type T to be supported.
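+ *
+ * A minimal usage sketch (array sizes and values here are arbitrary, chosen
+ * only for illustration):
+ * \code
+ * Eigen::ArrayXf a(3), x(3);
+ * a << 0.5f, 1.0f, 2.0f;
+ * x << 0.5f, 1.0f, 2.0f;
+ * Eigen::ArrayXf p = Eigen::igamma(a, x); // p(i) = igamma(a(i), x(i))
+ * \endcode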
+ * + * \sa Eigen::igammac(), Eigen::lgamma() + */ +template<typename Derived,typename ExponentDerived> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived> +igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) of the given arrays. + * + * This function computes the coefficient-wise complementary incomplete gamma function. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::igamma(), Eigen::lgamma() + */ +template<typename Derived,typename ExponentDerived> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived> +igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) of the given arrays. + * + * It returns the \a n -th derivative of the digamma function (psi) evaluated at \c x. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::digamma() + */ +// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x) +// * \sa ArrayBase::polygamma() +template<typename DerivedN,typename DerivedX> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX> +polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>( + n.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) of the given arrays. + * + * This function computes the regularized incomplete beta function (integral). + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar + * type T to be supported.
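+ *
+ * A minimal usage sketch (values are arbitrary; as noted above, this assumes
+ * float/double scalars in c++11 mode):
+ * \code
+ * Eigen::ArrayXd a(2), b(2), x(2);
+ * a << 1.0, 2.0;
+ * b << 2.0, 3.0;
+ * x << 0.5, 0.25;
+ * Eigen::ArrayXd r = Eigen::betainc(a, b, x); // r(i) = betainc(a(i), b(i), x(i))
+ * \endcode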
+ * + * \sa Eigen::betainc(), Eigen::lgamma() + */ +template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived> +inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived> +betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x) +{ + return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>( + a.derived(), + b.derived(), + x.derived() + ); +} + + +/** \returns an expression of the coefficient-wise zeta(\a x, \a q) of the given arrays. + * + * It returns the Riemann zeta function of two arguments \a x and \a q: + * + * \param x is the exponent; it must be > 1 + * \param q is the shift; it must be > 0 + * + * \note This function supports only float and double scalar types. To support other scalar types, the user has + * to provide implementations of zeta(T,T) for any scalar type T to be supported. + * + * \sa ArrayBase::zeta() + */ +template<typename DerivedX,typename DerivedQ> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ> +zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>( + x.derived(), + q.derived() + ); +} + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h new file mode 100644 index 000000000..d8f2363be --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com> +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
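+
+// For orientation: these functors are the scalar kernels behind the array API
+// in SpecialFunctionsArrayAPI.h. For instance, Eigen::igamma(a, x) builds a
+// CwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const ExponentDerived>
+// expression whose coefficients are evaluated through operator(), or through
+// packetOp when PacketAccess is enabled for the scalar type.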
+ +#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H +#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H + +namespace Eigen { + +namespace internal { + + +/** \internal + * \brief Template functor to compute the incomplete gamma function igamma(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma + */ +template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar> +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igamma; return igamma(a, x); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + return internal::pigamma(a, x); + } +}; +template<typename Scalar> +struct functor_traits<scalar_igamma_op<Scalar> > { + enum { + // Guesstimate + Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasIGamma + }; +}; + + +/** \internal + * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igammac + */ +template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar> +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igammac; return igammac(a, x); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const + { + return internal::pigammac(a, x); + } +}; +template<typename Scalar> +struct functor_traits<scalar_igammac_op<Scalar> > { + enum { + // Guesstimate + Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasIGammac + }; +}; + + +/** \internal + * \brief Template functor to compute the incomplete beta integral betainc(a, b, x) + * + */ +template<typename Scalar> struct scalar_betainc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const { + using numext::betainc; return betainc(x, a, b); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const + { + return internal::pbetainc(x, a, b); + } +}; +template<typename Scalar> +struct functor_traits<scalar_betainc_op<Scalar> > { + enum { + // Guesstimate + Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasBetaInc + }; +}; + + +/** \internal + * \brief Template functor to compute the natural log of the absolute + * value of Gamma of a scalar + * \sa class CwiseUnaryOp, Cwise::lgamma() + */ +template<typename Scalar> struct scalar_lgamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::lgamma; return lgamma(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_lgamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasLGamma + }; +}; + +/** \internal + * \brief Template functor to compute psi, the 
derivative of lgamma of a scalar. + * \sa class CwiseUnaryOp, Cwise::digamma() + */ +template<typename Scalar> struct scalar_digamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::digamma; return digamma(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_digamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasDiGamma + }; +}; + +/** \internal + * \brief Template functor to compute the Riemann Zeta function of two arguments. + * \sa class CwiseBinaryOp, Cwise::zeta() + */ +template<typename Scalar> struct scalar_zeta_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const { + using numext::zeta; return zeta(x, q); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); } +}; +template<typename Scalar> +struct functor_traits<scalar_zeta_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasZeta + }; +}; + +/** \internal + * \brief Template functor to compute the polygamma function. + * \sa class CwiseBinaryOp, Cwise::polygamma() + */ +template<typename Scalar> struct scalar_polygamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const { + using numext::polygamma; return polygamma(n, x); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); } +}; +template<typename Scalar> +struct functor_traits<scalar_polygamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasPolygamma + }; +}; + +/** \internal + * \brief Template functor to compute the Gauss error function of a + * scalar + * \sa class CwiseUnaryOp, Cwise::erf() + */ +template<typename Scalar> struct scalar_erf_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erf; return erf(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_erf_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasErf + }; +}; + +/** \internal + * \brief Template functor to compute the Complementary Error Function + * of a scalar + * \sa class CwiseUnaryOp, Cwise::erfc() + */ +template<typename Scalar> struct scalar_erfc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erfc; return erfc(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return
internal::perfc(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_erfc_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasErfc + }; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h new file mode 100644 index 000000000..553bcda6a --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h @@ -0,0 +1,47 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H +#define EIGEN_SPECIALFUNCTIONS_HALF_H + +namespace Eigen { +namespace numext { + +#if EIGEN_HAS_C99_MATH +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::digamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) { + return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) { + return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erf(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erfc(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) { + return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x))); +} +#endif + +} // end namespace numext +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_HALF_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h new file mode 100644 index 000000000..52619fc0c --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -0,0 +1,1551 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIAL_FUNCTIONS_H +#define EIGEN_SPECIAL_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Parts of this code are based on the Cephes Math Library. +// +// Cephes Math Library Release 2.8: June, 2000 +// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier +// +// Permission has been kindly provided by the original author +// to incorporate the Cephes software into the Eigen codebase: +// +// From: Stephen Moshier +// To: Eugene Brevdo +// Subject: Re: Permission to wrap several cephes functions in Eigen +// +// Hello Eugene, +// +// Thank you for writing. +// +// If your licensing is similar to BSD, the formal way that has been +// handled is simply to add a statement to the effect that you are incorporating +// the Cephes software by permission of the author. +// +// Good luck with your project, +// Steve + +namespace cephes { + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl<decltype(x), N>( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template <typename Scalar, int N> +struct polevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) { + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N]; + } +}; + +template <typename Scalar> +struct polevl<Scalar, 0> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) { + return coef[0]; + } +}; + +} // end namespace cephes + +/**************************************************************************** + * Implementation of lgamma, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct lgamma_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct lgamma_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); } +}; + +template <> +struct lgamma_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); } +}; +#endif + +/**************************************************************************** + * Implementation of digamma (psi), based on Cephes * + ****************************************************************************/ + +template <typename Scalar> +struct digamma_retval { + typedef Scalar type; +}; + +/* + * + * Polynomial evaluation helper for the Psi (digamma) function. 
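+ *
+ * (As a concrete illustration of the reversed coefficient order used by
+ * cephes::polevl above, not taken from Cephes itself: with
+ * const double coef[] = {2.0, 3.0, 4.0},
+ * cephes::polevl<double, 2>::run(5.0, coef) computes 2*5*5 + 3*5 + 4 = 69.)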
+ * + * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for + * input Scalar s, assuming s is above 10.0. + * + * If s is above a certain threshold for the given Scalar type, zero + * is returned. Otherwise the polynomial is evaluated with enough + * coefficients for results matching Scalar machine precision. + * + * + */ +template <typename Scalar> +struct digamma_impl_maybe_poly { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + + +template <> +struct digamma_impl_maybe_poly<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float s) { + const float A[] = { + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f + }; + + float z; + if (s < 1.0e8f) { + z = 1.0f / (s * s); + return z * cephes::polevl<float, 3>::run(z, A); + } else return 0.0f; + } +}; + +template <> +struct digamma_impl_maybe_poly<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double s) { + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + double z; + if (s < 1.0e17) { + z = 1.0 / (s * s); + return z * cephes::polevl<double, 6>::run(z, A); + } + else return 0.0; + } +}; + +template <typename Scalar> +struct digamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x) { + /* + * + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * double x, y, psi(); + * + * y = psi( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. 
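+ *
+ * A spot check of the identities above (standard values, not part of the
+ * original Cephes notes): psi(1) = -EUL = -0.57721566..., and the recurrence
+ * then gives psi(2) = psi(1) + 1 = 0.42278433...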
+ * + * ACCURACY (float): + * Relative error (except absolute when |psi| < 1): + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 1.3e-15 1.4e-16 + * IEEE -30,0 40000 1.5e-15 2.2e-16 + * + * ACCURACY (double): + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 INFINITY + */ + + Scalar p, q, nz, s, w, y; + bool negative = false; + + const Scalar maxnum = NumTraits<Scalar>::infinity(); + const Scalar m_pi = Scalar(EIGEN_PI); + + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + nz = zero; + + if (x <= zero) { + negative = true; + q = x; + p = numext::floor(q); + if (p == q) { + return maxnum; + } + /* Remove the zeros of tan(m_pi x) + * by subtracting the nearest integer from x + */ + nz = q - p; + if (nz != half) { + if (nz > half) { + p += one; + nz = q - p; + } + nz = m_pi / numext::tan(m_pi * nz); + } + else { + nz = zero; + } + x = one - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = zero; + while (s < Scalar(10)) { + w += one / s; + s += one; + } + + y = digamma_impl_maybe_poly<Scalar>::run(s); + + y = numext::log(s) - (half / s) - y - w; + + return (negative) ? y - nz : y; + } +}; + +/**************************************************************************** + * Implementation of erf, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct erf_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct erf_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erf_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); } +}; + +template <> +struct erf_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/*************************************************************************** +* Implementation of erfc, requires C++11/C99 * +****************************************************************************/ + +template <typename Scalar> +struct erfc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct erfc_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erfc_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); } +}; + +template <> +struct erfc_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************************** + * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 * + **************************************************************************************************************/ + +template <typename Scalar> +struct igammac_retval { + 
typedef Scalar type; +}; + +// NOTE: cephes_helper is also used to implement zeta +template <typename Scalar> +struct cephes_helper { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; } +}; + +template <> +struct cephes_helper<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float machep() { + return NumTraits<float>::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float big() { + // use epsneg (1.0 - epsneg == 1.0) + return 1.0f / (NumTraits<float>::epsilon() / 2); + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float biginv() { + // epsneg + return machep(); + } +}; + +template <> +struct cephes_helper<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double machep() { + return NumTraits<double>::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double big() { + return 1.0 / NumTraits<double>::epsilon(); + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double biginv() { + // inverse of eps + return NumTraits<double>::epsilon(); + } +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> struct igamma_impl; // predeclare igamma_impl + +template <typename Scalar> +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igamc() + * + * Incomplete gamma integral (modified for Eigen) + * + * + * + * SYNOPSIS: + * + * double a, x, y, igamc(); + * + * y = igamc( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * + * igamc(a,x) = 1 - igam(a,x) + * + * inf. + * - + * 1 | | -t a-1 + * = ----- | e t dt. + * - | | + * | (a) - + * x + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 7.8e-6 5.9e-7 + * + * + * ACCURACY (double): + * + * Tested at random a, x. + * a x Relative error: + * arithmetic domain domain # trials peak rms + * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15 + * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if ((x < zero) || (a <= zero)) { + // domain error + return nan; + } + + if ((x < one) || (x < a)) { + /* The checks above ensure that we meet the preconditions for + * igamma_impl::Impl(), so call it, rather than igamma_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. 
This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igamma_impl<Scalar>::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igamma_impl calls igammac_impl::Impl. */ + friend struct igamma_impl<Scalar>; + + /* Actually computes igamc(a, x). + * + * Preconditions: + * a > 0 + * x >= 1 + * x >= a + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + const Scalar big = cephes_helper<Scalar>::big(); + const Scalar biginv = cephes_helper<Scalar>::biginv(); + const Scalar inf = NumTraits<Scalar>::infinity(); + + Scalar ans, ax, c, yc, r, t, y, z; + Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; + + if (x == inf) return zero; // std::isinf crashes on CUDA + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); + if (ax < -maxlog) { // underflow + return zero; + } + ax = numext::exp(ax); + + // continued fraction + y = one - a; + z = x + y + one; + c = zero; + pkm2 = one; + qkm2 = x; + pkm1 = x + one; + qkm1 = z * x; + ans = pkm1 / qkm1; + + while (true) { + c += one; + y += one; + z += two; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != zero) { + r = pk / qk; + t = numext::abs((ans - r) / r); + ans = r; + } else { + t = one; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (numext::abs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if (t <= machep) { + break; + } + } + + return (ans * ax); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************ + * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 * + ************************************************************************************************/ + +template <typename Scalar> +struct igamma_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct igamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igam() + * Incomplete gamma integral + * + * + * + * SYNOPSIS: + * + * double a, x, y, igam(); + * + * y = igam( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * x + * - + * 1 | | -t a-1 + * igam(a,x) = ----- | e t dt. + * - | | + * | (a) - + * 0 + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (double): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 200000 3.6e-14 2.9e-15 + * IEEE 0,100 300000 9.9e-14 1.5e-14 + * + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 20000 7.8e-6 5.9e-7 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. 
Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + + /* left tail of incomplete gamma function: + * + * inf. k + * a -x - x + * x e > ---------- + * - - + * k=0 | (a+k+1) + * + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if (x == zero) return zero; + + if ((x < zero) || (a <= zero)) { // domain error + return nan; + } + + if ((x > one) && (x > a)) { + /* The checks above ensure that we meet the preconditions for + * igammac_impl::Impl(), so call it, rather than igammac_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igammac_impl<Scalar>::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igammac_impl calls igamma_impl::Impl. */ + friend struct igammac_impl<Scalar>; + + /* Actually computes igam(a, x). + * + * Preconditions: + * x > 0 + * a > 0 + * !(x > 1 && x > a) + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + + Scalar ans, ax, c, r; + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); + if (ax < -maxlog) { + // underflow + return zero; + } + ax = numext::exp(ax); + + /* power series */ + r = a; + c = one; + ans = one; + + while (true) { + r += one; + c *= x/r; + ans += c; + if (c/ans <= machep) { + break; + } + } + + return (ans * ax / a); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/***************************************************************************** + * Implementation of Riemann zeta function of two arguments, based on Cephes * + *****************************************************************************/ + +template <typename Scalar> +struct zeta_retval { + typedef Scalar type; +}; + +template <typename Scalar> +struct zeta_impl_series { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <> +struct zeta_impl_series<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) { + int i = 0; + while(i < 9) + { + i += 1; + a += 1.0f; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <> +struct zeta_impl_series<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) { + int i = 0; + while( (i < 9) || (a <= 9.0) ) + { + i += 1; + a += 1.0; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <typename Scalar> +struct zeta_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x, Scalar q) { + /* zeta.c + * + * Riemann zeta function of two arguments + * + * + * + * SYNOPSIS: + * + * double x, q, y, zeta(); + * + * y = zeta( x, q ); + * + * + * + * DESCRIPTION: + * + * + * + * inf. 
+ * - -x + * zeta(x,q) = > (k+q) + * - + * k=0 + * + * where x > 1 and q is not a negative integer or zero. + * The Euler-Maclaurin summation formula is used to obtain + * the expansion + * + * n + * - -x + * zeta(x,q) = > (k+q) + * - + * k=1 + * + * 1-x inf. B x(x+1)...(x+2j) + * (n+q) 1 - 2j + * + --------- - ------- + > -------------------- + * x-1 x - x+2j+1 + * 2(n+q) j=1 (2j)! (n+q) + * + * where the B2j are Bernoulli numbers. Note that (see zetac.c) + * zeta(x,1) = zetac(x) + 1. + * + * + * + * ACCURACY: + * + * Relative error for single precision: + * arithmetic domain # trials peak rms + * IEEE 0,25 10000 6.9e-7 1.0e-7 + * + * Large arguments may produce underflow in powf(), in which + * case the results are inaccurate. + * + * REFERENCE: + * + * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals, + * Series, and Products, p. 1073; Academic Press, 1980. + * + */ + + int i; + Scalar p, r, a, b, k, s, t, w; + + const Scalar A[] = { + Scalar(12.0), + Scalar(-720.0), + Scalar(30240.0), + Scalar(-1209600.0), + Scalar(47900160.0), + Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/ + Scalar(7.47242496e10), + Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/ + Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/ + Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/ + Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/ + Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/ + }; + + const Scalar maxnum = NumTraits<Scalar>::infinity(); + const Scalar zero = 0.0, half = 0.5, one = 1.0; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if( x == one ) + return maxnum; + + if( x < one ) + { + return nan; + } + + if( q <= zero ) + { + if(q == numext::floor(q)) + { + return maxnum; + } + p = x; + r = numext::floor(p); + if (p != r) + return nan; + } + + /* Permit negative q but continue sum until n+q > +9 . + * This case should be handled by a reflection formula. + * If q<0 and x is an integer, there is a relation to + * the polygamma function. 
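+ *
+ * A convenient spot check (standard identity, not from the Cephes notes):
+ * zeta(x, 1) reduces to the ordinary Riemann zeta function, e.g.
+ * zeta(2, 1) = pi*pi/6 = 1.6449340668...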
+ */ + s = numext::pow( q, -x ); + a = q; + b = zero; + // Run the summation in a helper function that is specific to the floating precision + if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) { + return s; + } + + w = a; + s += b*w/(x-one); + s -= half * b; + a = one; + k = zero; + for( i=0; i<12; i++ ) + { + a *= x + k; + b /= w; + t = a*b/A[i]; + s = s + t; + t = numext::abs(t/s); + if( t < machep ) { + break; + } + k += one; + a *= x + k; + b /= w; + k += one; + } + return s; + } +}; + +/**************************************************************************** + * Implementation of polygamma function, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct polygamma_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar n, Scalar x) { + Scalar zero = 0.0, one = 1.0; + Scalar nplus = n + one; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + // Check that n is an integer + if (numext::floor(n) != n) { + return nan; + } + // Just return the digamma function for n = 0 + else if (n == zero) { + return digamma_impl<Scalar>::run(x); + } + // Use the same implementation as scipy + else { + Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus)); + return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x); + } + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************ + * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 * + ************************************************************************************************/ + +template <typename Scalar> +struct betainc_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct betainc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct betainc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) { + /* betaincf.c + * + * Incomplete beta integral + * + * + * SYNOPSIS: + * + * float a, b, x, y, betaincf(); + * + * y = betaincf( a, b, x ); + * + * + * DESCRIPTION: + * + * Returns incomplete beta integral of the arguments, evaluated + * from zero to x. The function is defined as + * + * x + * - - + * | (a+b) | | a-1 b-1 + * ----------- | t (1-t) dt. + * - - | | + * | (a) | (b) - + * 0 + * + * The domain of definition is 0 <= x <= 1. In this + * implementation a and b are restricted to positive values. + * The integral from x to 1 may be obtained by the symmetry + * relation + * + * 1 - betainc( a, b, x ) = betainc( b, a, 1-x ). + * + * The integral is evaluated by a continued fraction expansion. + * If a < 1, the function calls itself recursively after a + * transformation to increase a to a+1.
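+ *
+ * An elementary special case that serves as a sanity check (not from the
+ * Cephes notes): for a = 1 the integral is 1 - (1-x)^b, so for instance
+ * betaincf( 1, 2, 0.5 ) = 0.75.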
+ * + * ACCURACY (float): + * + * Tested at random points (a,b,x) with a and b in the indicated + * interval and x between 0 and 1. + * + * arithmetic domain # trials peak rms + * Relative error: + * IEEE 0,30 10000 3.7e-5 5.1e-6 + * IEEE 0,100 10000 1.7e-4 2.5e-5 + * The useful domain for relative error is limited by underflow + * of the single precision exponential function. + * Absolute error: + * IEEE 0,30 100000 2.2e-5 9.6e-7 + * IEEE 0,100 10000 6.5e-5 3.7e-6 + * + * Larger errors may occur for extreme ratios of a and b. + * + * ACCURACY (double): + * arithmetic domain # trials peak rms + * IEEE 0,5 10000 6.9e-15 4.5e-16 + * IEEE 0,85 250000 2.2e-13 1.7e-14 + * IEEE 0,1000 30000 5.3e-12 6.3e-13 + * IEEE 0,10000 250000 9.3e-11 7.1e-12 + * IEEE 0,100000 10000 8.7e-10 4.8e-11 + * Outputs smaller than the IEEE gradual underflow threshold + * were excluded from these statistics. + * + * ERROR MESSAGES: + * message condition value returned + * incbet domain x<0, x>1 nan + * incbet underflow nan + */ + + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True) + * Continued fraction expansion #2 for incomplete beta integral (small_branch = False) + */ +template <typename Scalar> +struct incbeta_cfe { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value || + internal::is_same<Scalar, double>::value), + THIS_TYPE_IS_NOT_SUPPORTED); + const Scalar big = cephes_helper<Scalar>::big(); + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar biginv = cephes_helper<Scalar>::biginv(); + + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + + Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2; + Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update; + Scalar ans; + int n; + + const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300; + const Scalar thresh = + (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep; + Scalar r = (internal::is_same<Scalar, float>::value) ? 
zero : one; + + if (small_branch) { + k1 = a; + k2 = a + b; + k3 = a; + k4 = a + one; + k5 = one; + k6 = b - one; + k7 = k4; + k8 = a + two; + k26update = one; + } else { + k1 = a; + k2 = b - one; + k3 = a; + k4 = a + one; + k5 = one; + k6 = a + b; + k7 = a + one; + k8 = a + two; + k26update = -one; + x = x / (one - x); + } + + pkm2 = zero; + qkm2 = one; + pkm1 = one; + qkm1 = one; + ans = one; + n = 0; + + do { + xk = -(x * k1 * k2) / (k3 * k4); + pk = pkm1 + pkm2 * xk; + qk = qkm1 + qkm2 * xk; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + xk = (x * k5 * k6) / (k7 * k8); + pk = pkm1 + pkm2 * xk; + qk = qkm1 + qkm2 * xk; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + if (qk != zero) { + r = pk / qk; + if (numext::abs(ans - r) < numext::abs(r) * thresh) { + return r; + } + ans = r; + } + + k1 += one; + k2 += k26update; + k3 += two; + k4 += two; + k5 += one; + k6 -= k26update; + k7 += two; + k8 += two; + + if ((numext::abs(qk) + numext::abs(pk)) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) { + pkm2 *= big; + pkm1 *= big; + qkm2 *= big; + qkm1 *= big; + } + } while (++n < num_iters); + + return ans; + } +}; + +/* Helper functions depending on the Scalar type */ +template <typename Scalar> +struct betainc_helper {}; + +template <> +struct betainc_helper<float> { + /* Core implementation, assumes a large (> 1.0) */ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb, + float xx) { + float ans, a, b, t, x, onemx; + bool reversed_a_b = false; + + onemx = 1.0f - xx; + + /* see if x is greater than the mean */ + if (xx > (aa / (aa + bb))) { + reversed_a_b = true; + a = bb; + b = aa; + t = xx; + x = onemx; + } else { + a = aa; + b = bb; + t = onemx; + x = xx; + } + + /* Choose expansion for optimal convergence */ + if (b > 10.0f) { + if (numext::abs(b * x / a) < 0.3f) { + t = betainc_helper<float>::incbps(a, b, x); + if (reversed_a_b) t = 1.0f - t; + return t; + } + } + + ans = x * (a + b - 2.0f) / (a - 1.0f); + if (ans < 1.0f) { + ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */); + t = b * numext::log(t); + } else { + ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */); + t = (b - 1.0f) * numext::log(t); + } + + t += a * numext::log(x) + lgamma_impl<float>::run(a + b) - + lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b); + t += numext::log(ans / a); + t = numext::exp(t); + + if (reversed_a_b) t = 1.0f - t; + return t; + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) { + float t, u, y, s; + const float machep = cephes_helper<float>::machep(); + + y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a); + y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b); + y += lgamma_impl<float>::run(a + b); + + t = x / (1.0f - x); + s = 0.0f; + u = 1.0f; + do { + b -= 1.0f; + if (b == 0.0f) { + break; + } + a += 1.0f; + u *= t * b / a; + s += u; + } while (numext::abs(u) > machep); + + return numext::exp(y) * (1.0f + s); + } +}; + +template <> +struct betainc_impl<float> { + EIGEN_DEVICE_FUNC + static float run(float a, float b, float x) { + const float nan = NumTraits<float>::quiet_NaN(); + float ans, t; + + if (a <= 0.0f) return nan; + if (b <= 0.0f) return nan; + if ((x <= 0.0f) || (x >= 1.0f)) { + if (x == 0.0f) return 0.0f; + if (x == 1.0f) return 1.0f; + // mtherr("betaincf", DOMAIN); + return nan; + } + + /* transformation for small aa */ + if 
(a <= 1.0f) { + ans = betainc_helper<float>::incbsa(a + 1.0f, b, x); + t = a * numext::log(x) + b * numext::log1p(-x) + + lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a + 1.0f) - + lgamma_impl<float>::run(b); + return (ans + numext::exp(t)); + } else { + return betainc_helper<float>::incbsa(a, b, x); + } + } +}; + +template <> +struct betainc_helper<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) { + const double machep = cephes_helper<double>::machep(); + + double s, t, u, v, n, t1, z, ai; + + ai = 1.0 / a; + u = (1.0 - b) * x; + v = u / (a + 1.0); + t1 = v; + t = u; + n = 2.0; + s = 0.0; + z = machep * ai; + while (numext::abs(v) > z) { + u = (n - b) * x / n; + t *= u; + v = t / (a + n); + s += v; + n += 1.0; + } + s += t1; + s += ai; + + u = a * numext::log(x); + // TODO: gamma() is not directly implemented in Eigen. + /* + if ((a + b) < maxgam && numext::abs(u) < maxlog) { + t = gamma(a + b) / (gamma(a) * gamma(b)); + s = s * t * pow(x, a); + } else { + */ + t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) - + lgamma_impl<double>::run(b) + u + numext::log(s); + return s = numext::exp(t); + } +}; + +template <> +struct betainc_impl<double> { + EIGEN_DEVICE_FUNC + static double run(double aa, double bb, double xx) { + const double nan = NumTraits<double>::quiet_NaN(); + const double machep = cephes_helper<double>::machep(); + // const double maxgam = 171.624376956302725; + + double a, b, t, x, xc, w, y; + bool reversed_a_b = false; + + if (aa <= 0.0 || bb <= 0.0) { + return nan; // goto domerr; + } + + if ((xx <= 0.0) || (xx >= 1.0)) { + if (xx == 0.0) return (0.0); + if (xx == 1.0) return (1.0); + // mtherr("incbet", DOMAIN); + return nan; + } + + if ((bb * xx) <= 1.0 && xx <= 0.95) { + return betainc_helper<double>::incbps(aa, bb, xx); + } + + w = 1.0 - xx; + + /* Reverse a and b if x is greater than the mean. */ + if (xx > (aa / (aa + bb))) { + reversed_a_b = true; + a = bb; + b = aa; + xc = xx; + x = w; + } else { + a = aa; + b = bb; + xc = w; + x = xx; + } + + if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) { + t = betainc_helper<double>::incbps(a, b, x); + if (t <= machep) { + t = 1.0 - machep; + } else { + t = 1.0 - t; + } + return t; + } + + /* Choose expansion for better convergence. */ + y = x * (a + b - 2.0) - (a - 1.0); + if (y < 0.0) { + w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */); + } else { + w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc; + } + + /* Multiply w by the factor + a b _ _ _ + x (1-x) | (a+b) / ( a | (a) | (b) ) . */ + + y = a * numext::log(x); + t = b * numext::log(xc); + // TODO: gamma is not directly implemented in Eigen. + /* + if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog) + { + t = pow(xc, b); + t *= pow(x, a); + t /= a; + t *= w; + t *= gamma(a + b) / (gamma(a) * gamma(b)); + } else { + */ + /* Resort to logarithms. 
*/ + y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) - + lgamma_impl<double>::run(b); + y += numext::log(w / a); + t = numext::exp(y); + + /* } */ + // done: + + if (reversed_a_b) { + if (t <= machep) { + t = 1.0 - machep; + } else { + t = 1.0 - t; + } + } + return t; + } +}; + +#endif // EIGEN_HAS_C99_MATH + +} // end namespace internal + +namespace numext { + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) + lgamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar) + digamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar) +zeta(const Scalar& x, const Scalar& q) { + return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar) +polygamma(const Scalar& n, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) + erf(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) + erfc(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) + igamma(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) + igammac(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar) + betainc(const Scalar& a, const Scalar& b, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x); +} + +} // end namespace numext + + +} // end namespace Eigen + +#endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h new file mode 100644 index 000000000..46d60d323 --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h @@ -0,0 +1,58 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
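(The numext wrappers just defined are the scalar entry points that everything else, including the packet-math header starting here, routes through. A small sketch, again assuming the installed unsupported headers, of the symmetry relation quoted in the Cephes notes earlier:

    // Sketch: 1 - betainc(a, b, x) == betainc(b, a, 1 - x), up to roundoff.
    #include <cassert>
    #include <cmath>
    #include <unsupported/Eigen/SpecialFunctions>

    int main() {
      const double a = 2.5, b = 4.0, x = 0.3;
      double lhs = 1.0 - Eigen::numext::betainc(a, b, x);
      double rhs = Eigen::numext::betainc(b, a, 1.0 - x);
      assert(std::abs(lhs - rhs) < 1e-12);
      return 0;
    }
)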
+
+#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
+
+/** \internal \returns the zeta function of two arguments (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
+
+/** \internal \returns the polygamma function (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
+
+/** \internal \returns the incomplete beta function betainc(\a a, \a b, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pbetainc(const Packet& a, const Packet& b, const Packet& x) { using numext::betainc; return betainc(a, b, x); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
new file mode 100644
index 000000000..ec4fa8448
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
+#define EIGEN_CUDA_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
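(The generic templates above simply forward each packet to the scalar numext call; a vectorized target then overrides them per packet type, which is what the CUDA block below does for float4 and double2. The shape of the pattern, shown with a hypothetical op named pfoo and slightly simplified:

    // Illustration only; 'foo' and 'pfoo' are hypothetical stand-ins.
    // Generic fallback: delegate to the scalar implementation.
    template <typename Packet>
    Packet pfoo(const Packet& a) { using numext::foo; return foo(a); }

    // Device-side specialization: fan out over the lanes of the CUDA vector type.
    template <>
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    float4 pfoo<float4>(const float4& a) {
      using numext::foo;
      return make_float4(foo(a.x), foo(a.y), foo(a.z), foo(a.w));
    }
)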
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plgamma<float4>(const float4& a) +{ + return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plgamma<double2>(const double2& a) +{ + using numext::lgamma; + return make_double2(lgamma(a.x), lgamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pdigamma<float4>(const float4& a) +{ + using numext::digamma; + return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pdigamma<double2>(const double2& a) +{ + using numext::digamma; + return make_double2(digamma(a.x), digamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pzeta<float4>(const float4& x, const float4& q) +{ + using numext::zeta; + return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pzeta<double2>(const double2& x, const double2& q) +{ + using numext::zeta; + return make_double2(zeta(x.x, q.x), zeta(x.y, q.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 ppolygamma<float4>(const float4& n, const float4& x) +{ + using numext::polygamma; + return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 ppolygamma<double2>(const double2& n, const double2& x) +{ + using numext::polygamma; + return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perf<float4>(const float4& a) +{ + return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perf<double2>(const double2& a) +{ + using numext::erf; + return make_double2(erf(a.x), erf(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perfc<float4>(const float4& a) +{ + using numext::erfc; + return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perfc<double2>(const double2& a) +{ + using numext::erfc; + return make_double2(erfc(a.x), erfc(a.y)); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigamma<float4>(const float4& a, const float4& x) +{ + using numext::igamma; + return make_float4( + igamma(a.x, x.x), + igamma(a.y, x.y), + igamma(a.z, x.z), + igamma(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigamma<double2>(const double2& a, const double2& x) +{ + using numext::igamma; + return make_double2(igamma(a.x, x.x), igamma(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigammac<float4>(const float4& a, const float4& x) +{ + using numext::igammac; + return make_float4( + igammac(a.x, x.x), + igammac(a.y, x.y), + igammac(a.z, x.z), + igammac(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac<double2>(const double2& a, const double2& x) +{ + using numext::igammac; + return make_double2(igammac(a.x, x.x), igammac(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x) +{ + using numext::betainc; + return make_float4( + betainc(a.x, b.x, x.x), + betainc(a.y, b.y, x.y), + betainc(a.z, b.z, x.z), + betainc(a.w, b.w, x.w)); +} + +template<> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x) +{ + using numext::betainc; + return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H diff --git a/unsupported/Eigen/src/Splines/CMakeLists.txt b/unsupported/Eigen/src/Splines/CMakeLists.txt deleted file mode 100644 index 55c6271e9..000000000 --- a/unsupported/Eigen/src/Splines/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Splines_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Splines_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Splines COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h index ddcddfc9a..627f6e482 100644 --- a/unsupported/Eigen/src/Splines/Spline.h +++ b/unsupported/Eigen/src/Splines/Spline.h @@ -94,7 +94,7 @@ namespace Eigen const KnotVectorType& knots() const { return m_knots; } /** - * \brief Returns the knots of the underlying spline. + * \brief Returns the ctrls of the underlying spline. **/ const ControlPointVectorType& ctrls() const { return m_ctrls; } diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp index 6b6fac075..afb0c94c2 100644 --- a/unsupported/doc/examples/BVH_Example.cpp +++ b/unsupported/doc/examples/BVH_Example.cpp @@ -6,9 +6,7 @@ using namespace Eigen; typedef AlignedBox<double, 2> Box2d; namespace Eigen { - namespace internal { - Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point - } + Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point } struct PointPointMinimizer //how to compute squared distances between points and rectangles diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index fab140871..c0b321617 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -111,10 +111,14 @@ ei_add_test(gmres) ei_add_test(minres) ei_add_test(levenberg_marquardt) ei_add_test(kronecker_product) +ei_add_test(special_functions) # TODO: The following test names are prefixed with the cxx11 string, since historically # the tests depended on c++11. This isn't the case anymore so we ought to rename them. -ei_add_test(cxx11_float16) +# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests +# when using visual studio. We should make the check more strict to enable the tests for +# newer versions of MSVC. +if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") ei_add_test(cxx11_tensor_dimension) ei_add_test(cxx11_tensor_map) ei_add_test(cxx11_tensor_assign) @@ -132,7 +136,8 @@ ei_add_test(cxx11_tensor_io) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") # This test requires __uint128_t which is only available on 64bit systems ei_add_test(cxx11_tensor_uint128) -endif() +endif() +endif() if(EIGEN_TEST_CXX11) # It should be safe to always run these tests as there is some fallback code for @@ -188,10 +193,12 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags since they trigger thousands of compilation warnings # in the CUDA runtime + # Also remove -ansi that is incompatible with std=c++11. 
string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) @@ -207,7 +214,14 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") endif() - set(CUDA_NVCC_FLAGS "-std=c++11 ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"") + if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) + set(EIGEN_CUDA_CXX11_FLAG "-std=c++11") + else() + # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11) + set(EIGEN_CUDA_CXX11_FLAG "") + endif() + + set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") @@ -217,6 +231,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) ei_add_test(cxx11_tensor_reduction_cuda) ei_add_test(cxx11_tensor_argmax_cuda) ei_add_test(cxx11_tensor_cast_float16_cuda) + ei_add_test(cxx11_tensor_scan_cuda) # The random number generation code requires arch 3.5 or greater. if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp index 1dd6dc97d..8b7528fb7 100644 --- a/unsupported/test/FFTW.cpp +++ b/unsupported/test/FFTW.cpp @@ -18,11 +18,11 @@ using namespace Eigen; template < typename T> -complex<long double> promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); } +complex<long double> promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); } -complex<long double> promote(float x) { return complex<long double>( x); } -complex<long double> promote(double x) { return complex<long double>( x); } -complex<long double> promote(long double x) { return complex<long double>( x); } +complex<long double> promote(float x) { return complex<long double>((long double)x); } +complex<long double> promote(double x) { return complex<long double>((long double)x); } +complex<long double> promote(long double x) { return complex<long double>((long double)x); } template <typename VT1,typename VT2> @@ -33,7 +33,7 @@ complex<long double> promote(long double x) { return complex<long double>( x); long double pi = acos((long double)-1 ); for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) { complex<long double> acc = 0; - long double phinc = -2.*k0* pi / timebuf.size(); + long double phinc = (long double)(-2.)*k0* pi / timebuf.size(); for (size_t k1=0;k1<(size_t)timebuf.size();++k1) { acc += promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) ); } @@ -54,8 +54,8 @@ complex<long double> promote(long double x) { return complex<long double>( x); long double difpower=0; size_t n = (min)( buf1.size(),buf2.size() ); for (size_t k=0;k<n;++k) { - totalpower += (numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2; - difpower += numext::abs2(buf1[k] - buf2[k]); + totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2); + difpower += (long double)(numext::abs2(buf1[k] - buf2[k])); } return sqrt(difpower/totalpower); } @@ -93,19 +93,19 @@ void 
test_scalar_generic(int nfft) fft.SetFlag(fft.HalfSpectrum ); fft.fwd( freqBuf,tbuf); VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) ); - VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>() );// gross check fft.ClearFlag(fft.HalfSpectrum ); fft.fwd( freqBuf,tbuf); VERIFY( (size_t)freqBuf.size() == (size_t)nfft); - VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>() );// gross check if (nfft&1) return; // odd FFTs get the wrong size inverse FFT ScalarVector tbuf2; fft.inv( tbuf2 , freqBuf); - VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>() );// gross check // verify that the Unscaled flag takes effect @@ -121,12 +121,12 @@ void test_scalar_generic(int nfft) //for (size_t i=0;i<(size_t) tbuf.size();++i) // cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " - in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) << endl; - VERIFY( dif_rmse(tbuf,tbuf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>() );// gross check // verify that ClearFlag works fft.ClearFlag(fft.Unscaled); fft.inv( tbuf2 , freqBuf); - VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>() );// gross check } template <typename T> @@ -152,10 +152,10 @@ void test_complex_generic(int nfft) inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) ); fft.fwd( outbuf , inbuf); - VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>() );// gross check fft.inv( buf3 , outbuf); - VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>() );// gross check // verify that the Unscaled flag takes effect ComplexVector buf4; @@ -163,12 +163,12 @@ void test_complex_generic(int nfft) fft.inv( buf4 , outbuf); for (int k=0;k<nfft;++k) buf4[k] *= T(1./nfft); - VERIFY( dif_rmse(inbuf,buf4) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>() );// gross check // verify that ClearFlag works fft.ClearFlag(fft.Unscaled); fft.inv( buf3 , outbuf); - VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>() );// gross check } template <typename T> diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp index b59fd1c43..2da6dd8f3 100644 --- a/unsupported/test/autodiff.cpp +++ b/unsupported/test/autodiff.cpp @@ -205,6 +205,10 @@ void test_autodiff_hessian() VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4)); VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3)); VERIFY_IS_APPROX(y.derivatives()(1).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4)); + + ADD z = x(0)*x(1); + VERIFY_IS_APPROX(z.derivatives()(0).derivatives(), Vector2d(0,1)); + VERIFY_IS_APPROX(z.derivatives()(1).derivatives(), Vector2d(1,0)); } double bug_1222() { @@ -234,6 +238,32 @@ double bug_1223() { return t.value() + t2.value(); } +// regression test for some compilation issues with specializations of ScalarBinaryOpTraits +void bug_1260() { + Matrix4d A; + Vector4d v; + A*v; +} + +// check a compilation issue with numext::max +double bug_1261() { + typedef 
AutoDiffScalar<Matrix2d> AD; + typedef Matrix<AD,2,1> VectorAD; + + VectorAD v; + const AD maxVal = v.maxCoeff(); + const AD minVal = v.minCoeff(); + return maxVal.value() + minVal.value(); +} + +double bug_1264() { + typedef AutoDiffScalar<Vector2d> AD; + const AD s; + const Matrix<AD, 3, 1> v1; + const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1; + return v2(0).value(); +} + void test_autodiff() { for(int i = 0; i < g_repeat; i++) { @@ -245,5 +275,7 @@ void test_autodiff() bug_1222(); bug_1223(); + bug_1260(); + bug_1261(); } diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index c631c734a..4df2f5c57 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -36,13 +36,48 @@ template<typename Scalar> void check_atan2() VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); } +template<typename Scalar> void check_hyperbolic_functions() +{ + using std::sinh; + using std::cosh; + using std::tanh; + typedef Matrix<Scalar, 1, 1> Deriv1; + typedef AutoDiffScalar<Deriv1> AD; + Deriv1 p = Deriv1::Random(); + AD val(p.x(),Deriv1::UnitX()); + + Scalar cosh_px = std::cosh(p.x()); + AD res1 = tanh(val); + VERIFY_IS_APPROX(res1.value(), std::tanh(p.x())); + VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(1.0) / (cosh_px * cosh_px)); + AD res2 = sinh(val); + VERIFY_IS_APPROX(res2.value(), std::sinh(p.x())); + VERIFY_IS_APPROX(res2.derivatives().x(), cosh_px); + AD res3 = cosh(val); + VERIFY_IS_APPROX(res3.value(), cosh_px); + VERIFY_IS_APPROX(res3.derivatives().x(), std::sinh(p.x())); + + // Check constant values. + const Scalar sample_point = Scalar(1) / Scalar(3); + val = AD(sample_point,Deriv1::UnitX()); + res1 = tanh(val); + VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(0.896629559604914)); + + res2 = sinh(val); + VERIFY_IS_APPROX(res2.derivatives().x(), Scalar(1.056071867829939)); + + res3 = cosh(val); + VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150)); +} void test_autodiff_scalar() { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( check_atan2<float>() ); CALL_SUBTEST_2( check_atan2<double>() ); + CALL_SUBTEST_3( check_hyperbolic_functions<float>() ); + CALL_SUBTEST_4( check_hyperbolic_functions<double>() ); } } diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp deleted file mode 100644 index e39a7f83c..000000000 --- a/unsupported/test/cxx11_float16.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_float16 - -#include "main.h" -#include <Eigen/src/Core/arch/CUDA/Half.h> - -using Eigen::half; - -void test_conversion() -{ - // Conversion from float. - VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5f).x, 0x3800); - VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555); - VERIFY_IS_EQUAL(half(0.0f).x, 0x0000); - VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000); - VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff); - VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity. - - // Denormals. - VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001); - VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001); - VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); - - // Verify round-to-nearest-even behavior. 
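(The check being deleted here pins down round-to-nearest-even: a float exactly halfway between two adjacent half values rounds to the one whose mantissa is even. A standalone sketch of the same property, assuming Eigen::half is reachable the way the deleted test reached it:

    // Sketch: 1.0f is 0x3c00 and the next half is 0x3c01; their midpoint
    // rounds down because 0x3c00 has the even mantissa.
    #include "main.h"                              // test harness, as in the deleted file
    #include <Eigen/src/Core/arch/CUDA/Half.h>

    void check_round_to_even() {
      float lo = float(Eigen::half(__half(0x3c00)));
      float hi = float(Eigen::half(__half(0x3c01)));
      VERIFY_IS_EQUAL(Eigen::half(0.5f * (lo + hi)).x, 0x3c00);
    }
)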
- float val1 = float(half(__half(0x3c00))); - float val2 = float(half(__half(0x3c01))); - float val3 = float(half(__half(0x3c02))); - VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02); - - // Conversion from int. - VERIFY_IS_EQUAL(half(-1).x, 0xbc00); - VERIFY_IS_EQUAL(half(0).x, 0x0000); - VERIFY_IS_EQUAL(half(1).x, 0x3c00); - VERIFY_IS_EQUAL(half(2).x, 0x4000); - VERIFY_IS_EQUAL(half(3).x, 0x4200); - - // Conversion from bool. - VERIFY_IS_EQUAL(half(false).x, 0x0000); - VERIFY_IS_EQUAL(half(true).x, 0x3c00); - - // Conversion to float. - VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f); - VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f); - - // Denormals. - VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f); - - // NaNs and infinities. - VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. - VERIFY(!(numext::isnan)(float(half(0.0f)))); - VERIFY((numext::isinf)(float(half(__half(0xfc00))))); - VERIFY((numext::isnan)(float(half(__half(0xfc01))))); - VERIFY((numext::isinf)(float(half(__half(0x7c00))))); - VERIFY((numext::isnan)(float(half(__half(0x7c01))))); - -#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); - VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); - VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); -#endif - - // Exactly same checks as above, just directly on the half representation. - VERIFY(!(numext::isinf)(half(__half(0x7bff)))); - VERIFY(!(numext::isnan)(half(__half(0x0000)))); - VERIFY((numext::isinf)(half(__half(0xfc00)))); - VERIFY((numext::isnan)(half(__half(0xfc01)))); - VERIFY((numext::isinf)(half(__half(0x7c00)))); - VERIFY((numext::isnan)(half(__half(0x7c01)))); - -#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY((numext::isnan)(half(0.0 / 0.0))); - VERIFY((numext::isinf)(half(1.0 / 0.0))); - VERIFY((numext::isinf)(half(-1.0 / 0.0))); -#endif -} - -void test_numtraits() -{ - std::cout << "expsilin = " << NumTraits<half>::epsilon() << std::endl; - std::cout << "highest = " << NumTraits<half>::highest() << std::endl; - std::cout << "lowest = " << NumTraits<half>::lowest() << std::endl; - std::cout << "inifinty = " << NumTraits<half>::infinity() << std::endl; - std::cout << "nan = " << NumTraits<half>::quiet_NaN() << std::endl; - -} - -void test_arithmetic() -{ - VERIFY_IS_EQUAL(float(half(2) + half(2)), 4); - VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0); - VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f); - VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f); - VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f); - VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f); - VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f); -} - -void test_comparison() -{ - VERIFY(half(1.0f) > half(0.5f)); - VERIFY(half(0.5f) < half(1.0f)); - VERIFY(!(half(1.0f) < half(0.5f))); - VERIFY(!(half(0.5f) > half(1.0f))); - - VERIFY(!(half(4.0f) > half(4.0f))); - VERIFY(!(half(4.0f) < half(4.0f))); - - VERIFY(!(half(0.0f) < half(-0.0f))); - VERIFY(!(half(-0.0f) < half(0.0f))); - VERIFY(!(half(0.0f) > half(-0.0f))); - VERIFY(!(half(-0.0f) > half(0.0f))); - - VERIFY(half(0.2f) > half(-1.0f)); - VERIFY(half(-1.0f) < half(0.2f)); - VERIFY(half(-16.0f) < half(-15.0f)); - - VERIFY(half(1.0f) == half(1.0f)); - VERIFY(half(1.0f) != half(2.0f)); - - // Comparisons 
with NaNs and infinities. -#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); - VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); - - VERIFY(!(half(1.0) == half(0.0 / 0.0))); - VERIFY(!(half(1.0) < half(0.0 / 0.0))); - VERIFY(!(half(1.0) > half(0.0 / 0.0))); - VERIFY(half(1.0) != half(0.0 / 0.0)); - - VERIFY(half(1.0) < half(1.0 / 0.0)); - VERIFY(half(1.0) > half(-1.0 / 0.0)); -#endif -} - -void test_basic_functions() -{ - VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); - VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); - - VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f); - VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f); - - VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f); - VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f); - - VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f); - - VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f); - - VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); - VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); - - VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); -} - -void test_trigonometric_functions() -{ - VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); - VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); - //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); - //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); - - VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); - // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); - VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f))); - - VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2))); - //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f))); -} - -void test_cxx11_float16() -{ - CALL_SUBTEST(test_conversion()); - CALL_SUBTEST(test_numtraits()); - CALL_SUBTEST(test_arithmetic()); - CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_basic_functions()); - CALL_SUBTEST(test_trigonometric_functions()); -} diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp index 6569218c4..5f9bb938b 100644 --- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp +++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -27,6 +27,8 @@ static void test_parallelism() // Test we never-ever fail to match available tasks with idle threads. const int kThreads = 16; // code below expects that this is a multiple of 4 NonBlockingThreadPool tp(kThreads); + VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); + VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); for (int iter = 0; iter < 100; ++iter) { std::atomic<int> running(0); std::atomic<int> done(0); @@ -34,6 +36,9 @@ static void test_parallelism() // Schedule kThreads tasks and ensure that they all are running. 
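(The two assertions added above define the thread-id contract that the worker-side checks just below rely on: CurrentThreadId() is -1 on threads that do not belong to the pool, and a stable index in [0, NumThreads()) inside a scheduled task. A minimal standalone sketch of that API, hedged to what this test exercises:

    // Sketch of the NonBlockingThreadPool thread-id contract.
    #include <unsupported/Eigen/CXX11/ThreadPool>
    #include <cassert>

    int main() {
      Eigen::NonBlockingThreadPool pool(4);
      assert(pool.NumThreads() == 4);
      assert(pool.CurrentThreadId() == -1);   // the caller is not a pool thread
      pool.Schedule([&pool]() {
        int id = pool.CurrentThreadId();      // in [0, 4) on a pool thread
        assert(id >= 0 && id < pool.NumThreads());
      });
      return 0;                               // the destructor joins the workers
    }
)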
for (int i = 0; i < kThreads; ++i) { tp.Schedule([&]() { + const int thread_id = tp.CurrentThreadId(); + VERIFY_GE(thread_id, 0); + VERIFY_LE(thread_id, kThreads - 1); running++; while (phase < 1) { } diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 4026f48f0..284b46803 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -1019,6 +1019,153 @@ void test_cuda_erfc(const Scalar stddev) cudaFree(d_out); } +template <typename Scalar> +void test_cuda_betainc() +{ + Tensor<Scalar, 1> in_x(125); + Tensor<Scalar, 1> in_a(125); + Tensor<Scalar, 1> in_b(125); + Tensor<Scalar, 1> out(125); + Tensor<Scalar, 1> expected_out(125); + out.setZero(); + + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + + Array<Scalar, 1, Dynamic> x(125); + Array<Scalar, 1, Dynamic> a(125); + Array<Scalar, 1, Dynamic> b(125); + Array<Scalar, 1, Dynamic> v(125); + + a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999; + + b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 
0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999; + + x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, + -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1; + + v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, + 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, + 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, + 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, + nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, + 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, + 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, + 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, + 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, + 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, + nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444, + nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229, + nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, + nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306, + 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302, + 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, + 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196, + 0.5000000000004947, 0.9999999999999999, nan; + + for (int i = 0; i < 125; ++i) { + in_x(i) = x(i); + in_a(i) = a(i); + in_b(i) = b(i); + expected_out(i) = v(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_a; + Scalar* d_in_b; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_a), bytes); + cudaMalloc((void**)(&d_in_b), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 
1> > gpu_out(d_out, 125); + + gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 1; i < 125; ++i) { + if ((std::isnan)(expected_out(i))) { + VERIFY((std::isnan)(out(i))); + } else { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } + + cudaFree(d_in_x); + cudaFree(d_in_a); + cudaFree(d_in_b); + cudaFree(d_out); +} + + void test_cxx11_tensor_cuda() { CALL_SUBTEST_1(test_cuda_elementwise_small()); @@ -1086,5 +1233,8 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_5(test_cuda_igamma<double>()); CALL_SUBTEST_5(test_cuda_igammac<double>()); + + CALL_SUBTEST_6(test_cuda_betainc<float>()); + CALL_SUBTEST_6(test_cuda_betainc<double>()); #endif } diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp index 0bccc3396..16f168ed4 100644 --- a/unsupported/test/cxx11_tensor_dimension.cpp +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -21,7 +21,7 @@ static void test_dynamic_size() VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); - VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7); + VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7); VERIFY_IS_EQUAL((int)dimensions[0], 2); VERIFY_IS_EQUAL((int)dimensions[1], 3); VERIFY_IS_EQUAL((int)dimensions[2], 7); @@ -34,12 +34,12 @@ static void test_fixed_size() VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); - VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7); + VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7); } static void test_match() { - Eigen::DSizes<int, 3> dyn(2,3,7); + Eigen::DSizes<unsigned int, 3> dyn((unsigned int)2,(unsigned int)3,(unsigned int)7); Eigen::Sizes<2,3,7> stat; VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true); @@ -51,13 +51,13 @@ static void test_match() static void test_rank_zero() { Eigen::Sizes<> scalar; - VERIFY_IS_EQUAL(scalar.TotalSize(), 1); - VERIFY_IS_EQUAL(scalar.rank(), 0); - VERIFY_IS_EQUAL(internal::array_prod(scalar), 1); + VERIFY_IS_EQUAL((int)scalar.TotalSize(), 1); + VERIFY_IS_EQUAL((int)scalar.rank(), 0); + VERIFY_IS_EQUAL((int)internal::array_prod(scalar), 1); Eigen::DSizes<ptrdiff_t, 0> dscalar; - VERIFY_IS_EQUAL(dscalar.TotalSize(), 1); - VERIFY_IS_EQUAL(dscalar.rank(), 0u); + VERIFY_IS_EQUAL((int)dscalar.TotalSize(), 1); + VERIFY_IS_EQUAL((int)dscalar.rank(), 0); } void test_cxx11_tensor_dimension() diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp index 8bbcf7089..489960529 100644 --- a/unsupported/test/cxx11_tensor_io.cpp +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -14,6 +14,20 @@ template<int DataLayout> +static void test_output_0d() +{ + Tensor<int, 0, DataLayout> tensor; + tensor() = 123; + + std::stringstream os; + os << tensor; + + std::string expected("123"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> static void test_output_1d() { Tensor<int, 1, DataLayout> tensor(5); @@ -26,6 +40,12 @@ static void test_output_1d() std::string expected("0\n1\n2\n3\n4"); VERIFY_IS_EQUAL(std::string(os.str()), expected); + + Eigen::Tensor<double,1,DataLayout> 
empty_tensor(0); + std::stringstream empty_os; + empty_os << empty_tensor; + std::string empty_string; + VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string); } @@ -101,6 +121,8 @@ static void test_output_const() void test_cxx11_tensor_io() { + CALL_SUBTEST(test_output_0d<ColMajor>()); + CALL_SUBTEST(test_output_0d<RowMajor>()); CALL_SUBTEST(test_output_1d<ColMajor>()); CALL_SUBTEST(test_output_1d<RowMajor>()); CALL_SUBTEST(test_output_2d<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index c575d3fdc..f7de43110 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -13,6 +13,7 @@ using Eigen::Tensor; +template<typename> static void test_simple_reshape() { Tensor<float, 5> tensor1(2,3,1,7,1); @@ -40,7 +41,7 @@ static void test_simple_reshape() } } - +template<typename> static void test_reshape_in_expr() { MatrixXf m1(2,3*5*7*11); MatrixXf m2(3*5*7*11,13); @@ -65,7 +66,7 @@ static void test_reshape_in_expr() { } } - +template<typename> static void test_reshape_as_lvalue() { Tensor<float, 3> tensor(2,3,7); @@ -114,6 +115,7 @@ static void test_simple_slice() } } +template<typename=void> static void test_const_slice() { const float b[1] = {42}; @@ -459,25 +461,25 @@ static void test_composition() void test_cxx11_tensor_morphing() { - CALL_SUBTEST(test_simple_reshape()); - CALL_SUBTEST(test_reshape_in_expr()); - CALL_SUBTEST(test_reshape_as_lvalue()); - - CALL_SUBTEST(test_simple_slice<ColMajor>()); - CALL_SUBTEST(test_simple_slice<RowMajor>()); - CALL_SUBTEST(test_const_slice()); - CALL_SUBTEST(test_slice_in_expr<ColMajor>()); - CALL_SUBTEST(test_slice_in_expr<RowMajor>()); - CALL_SUBTEST(test_slice_as_lvalue<ColMajor>()); - CALL_SUBTEST(test_slice_as_lvalue<RowMajor>()); - CALL_SUBTEST(test_slice_raw_data<ColMajor>()); - CALL_SUBTEST(test_slice_raw_data<RowMajor>()); - - CALL_SUBTEST(test_strided_slice_write<ColMajor>()); - CALL_SUBTEST(test_strided_slice<ColMajor>()); - CALL_SUBTEST(test_strided_slice_write<RowMajor>()); - CALL_SUBTEST(test_strided_slice<RowMajor>()); - - CALL_SUBTEST(test_composition<ColMajor>()); - CALL_SUBTEST(test_composition<RowMajor>()); + CALL_SUBTEST_1(test_simple_reshape<void>()); + CALL_SUBTEST_1(test_reshape_in_expr<void>()); + CALL_SUBTEST_1(test_reshape_as_lvalue<void>()); + + CALL_SUBTEST_1(test_simple_slice<ColMajor>()); + CALL_SUBTEST_1(test_simple_slice<RowMajor>()); + CALL_SUBTEST_1(test_const_slice()); + CALL_SUBTEST_2(test_slice_in_expr<ColMajor>()); + CALL_SUBTEST_3(test_slice_in_expr<RowMajor>()); + CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>()); + CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>()); + CALL_SUBTEST_5(test_slice_raw_data<ColMajor>()); + CALL_SUBTEST_5(test_slice_raw_data<RowMajor>()); + + CALL_SUBTEST_6(test_strided_slice_write<ColMajor>()); + CALL_SUBTEST_6(test_strided_slice<ColMajor>()); + CALL_SUBTEST_6(test_strided_slice_write<RowMajor>()); + CALL_SUBTEST_6(test_strided_slice<RowMajor>()); + + CALL_SUBTEST_7(test_composition<ColMajor>()); + CALL_SUBTEST_7(test_composition<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 34e9f54a0..a6375d34a 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -13,14 +13,53 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU - +#include <cuda_fp16.h> #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> 
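(Everything in this file follows one pattern: evaluate an expression in float, evaluate it again through Eigen::half, and require the two results to agree to half precision. A host-side sketch of the cast round-trip at the core of that comparison, assuming this tree's Tensor module:

    // Sketch: cast a float tensor to half and back; values agree to roughly
    // 3 decimal digits, the precision the GPU comparisons below rely on.
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 1> f(8);
      f.setRandom();
      Eigen::Tensor<Eigen::half, 1> h = f.cast<Eigen::half>();
      Eigen::Tensor<float, 1> back = h.cast<float>();
      for (int i = 0; i < 8; ++i)
        std::cout << f(i) << " ~ " << back(i) << std::endl;
      return 0;
    }
)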
using Eigen::Tensor; +template<typename> +void test_cuda_numext() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); + bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>()); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>()); + + Tensor<bool, 1> half_prec(num_elem); + Tensor<bool, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking numext " << i << std::endl; + VERIFY_IS_EQUAL(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + #ifdef EIGEN_HAS_CUDA_FP16 +template<typename> void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -55,7 +94,7 @@ void test_cuda_conversion() { gpu_device.deallocate(d_conv); } - +template<typename> void test_cuda_unary() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -92,7 +131,7 @@ void test_cuda_unary() { gpu_device.deallocate(d_res_float); } - +template<typename> void test_cuda_elementwise() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -134,6 +173,7 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_float); } +template<typename> void test_cuda_trancendental() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -141,30 +181,39 @@ void test_cuda_trancendental() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1( - d_float1, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2( - d_float2, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half( - d_res1_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float( - d_res1_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half( - d_res2_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float( - 
d_res2_float, num_elem); + Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); + gpu_float3.device(gpu_device) = gpu_float3.random(); gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>(); gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>(); - gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().exp(); - gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>().log(); + gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>(); + + gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>(); + gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); + + gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>(); + gpu_res2_half.device(gpu_device) = gpu_res2_half.log(); + + gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); + gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); Tensor<float, 1> input1(num_elem); Tensor<Eigen::half, 1> half_prec1(num_elem); @@ -172,12 +221,18 @@ void test_cuda_trancendental() { Tensor<float, 1> input2(num_elem); Tensor<Eigen::half, 1> half_prec2(num_elem); Tensor<Eigen::half, 1> full_prec2(num_elem); + Tensor<float, 1> input3(num_elem); + Tensor<Eigen::half, 1> half_prec3(num_elem); + Tensor<Eigen::half, 1> full_prec3(num_elem); gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { @@ -186,17 +241,27 @@ void test_cuda_trancendental() { } for (int i = 0; i < num_elem; ++i) { std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << 
full_prec2(i) << " half = " << half_prec2(i) << std::endl; - VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); + if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1 + VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f)); + else + VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); + } + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl; + VERIFY_IS_APPROX(full_prec3(i), half_prec3(i)); } gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float3); gpu_device.deallocate(d_res1_half); gpu_device.deallocate(d_res1_float); gpu_device.deallocate(d_res2_half); gpu_device.deallocate(d_res2_float); + gpu_device.deallocate(d_res3_float); + gpu_device.deallocate(d_res3_half); } - +template<typename> void test_cuda_contractions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -247,7 +312,7 @@ void test_cuda_contractions() { gpu_device.deallocate(d_res_float); } - +template<typename> void test_cuda_reductions(int size1, int size2, int redux) { std::cout << "Reducing " << size1 << " by " << size2 @@ -296,17 +361,19 @@ void test_cuda_reductions(int size1, int size2, int redux) { gpu_device.deallocate(d_res_float); } +template<typename> void test_cuda_reductions() { - test_cuda_reductions(13, 13, 0); - test_cuda_reductions(13, 13, 1); + test_cuda_reductions<void>(13, 13, 0); + test_cuda_reductions<void>(13, 13, 1); - test_cuda_reductions(35, 36, 0); - test_cuda_reductions(35, 36, 1); + test_cuda_reductions<void>(35, 36, 0); + test_cuda_reductions<void>(35, 36, 1); - test_cuda_reductions(36, 35, 0); - test_cuda_reductions(36, 35, 1); + test_cuda_reductions<void>(36, 35, 0); + test_cuda_reductions<void>(36, 35, 1); } +template<typename> void test_cuda_full_reductions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -355,7 +422,7 @@ void test_cuda_full_reductions() { gpu_device.deallocate(d_res_float); } - +template<typename> void test_cuda_forced_evals() { Eigen::CudaStreamDevice stream; @@ -408,15 +475,17 @@ void test_cuda_forced_evals() { void test_cxx11_tensor_of_float16_cuda() { + CALL_SUBTEST_1(test_cuda_numext<void>()); + #ifdef EIGEN_HAS_CUDA_FP16 - CALL_SUBTEST_1(test_cuda_conversion()); - CALL_SUBTEST_1(test_cuda_unary()); - CALL_SUBTEST_1(test_cuda_elementwise()); - CALL_SUBTEST_1(test_cuda_trancendental()); - CALL_SUBTEST_2(test_cuda_contractions()); - CALL_SUBTEST_3(test_cuda_reductions()); - CALL_SUBTEST_4(test_cuda_full_reductions()); - CALL_SUBTEST_5(test_cuda_forced_evals()); + CALL_SUBTEST_1(test_cuda_conversion<void>()); + CALL_SUBTEST_1(test_cuda_unary<void>()); + CALL_SUBTEST_1(test_cuda_elementwise<void>()); + CALL_SUBTEST_1(test_cuda_trancendental<void>()); + CALL_SUBTEST_2(test_cuda_contractions<void>()); + CALL_SUBTEST_3(test_cuda_reductions<void>()); + CALL_SUBTEST_4(test_cuda_full_reductions<void>()); + CALL_SUBTEST_5(test_cuda_forced_evals<void>()); #else std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; #endif diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index ca483257b..1490ec3da 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -239,6 +239,33 @@ static void test_simple_reductions() { } } + +template <int DataLayout> +static void 
test_reductions_in_expr() { + Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7); + tensor.setRandom(); + array<ptrdiff_t, 2> reduction_axis2; + reduction_axis2[0] = 1; + reduction_axis2[1] = 3; + + Tensor<float, 2, DataLayout> result(2, 5); + result = result.constant(1.0f) - tensor.sum(reduction_axis2); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + float sum = 0.0f; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor(i, k, j, l); + } + } + VERIFY_IS_APPROX(result(i, j), 1.0f - sum); + } + } +} + + template <int DataLayout> static void test_full_reductions() { Tensor<float, 2, DataLayout> tensor(2, 3); @@ -462,6 +489,8 @@ void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_trivial_reductions<RowMajor>()); CALL_SUBTEST(test_simple_reductions<ColMajor>()); CALL_SUBTEST(test_simple_reductions<RowMajor>()); + CALL_SUBTEST(test_reductions_in_expr<ColMajor>()); + CALL_SUBTEST(test_reductions_in_expr<RowMajor>()); CALL_SUBTEST(test_full_reductions<ColMajor>()); CALL_SUBTEST(test_full_reductions<RowMajor>()); CALL_SUBTEST(test_user_defined_reductions<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index cad0c08e0..6d8f01c02 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -16,7 +16,7 @@ #include <unsupported/Eigen/CXX11/Tensor> -template<int DataLayout> +template<typename Type, int DataLayout> static void test_full_reductions() { Eigen::CudaStreamDevice stream; @@ -25,24 +25,24 @@ static void test_full_reductions() { const int num_rows = internal::random<int>(1024, 5*1024); const int num_cols = internal::random<int>(1024, 5*1024); - Tensor<float, 2, DataLayout> in(num_rows, num_cols); + Tensor<Type, 2, DataLayout> in(num_rows, num_cols); in.setRandom(); - Tensor<float, 0, DataLayout> full_redux; + Tensor<Type, 0, DataLayout> full_redux; full_redux = in.sum(); - std::size_t in_bytes = in.size() * sizeof(float); - std::size_t out_bytes = full_redux.size() * sizeof(float); - float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes)); - float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes)); + std::size_t in_bytes = in.size() * sizeof(Type); + std::size_t out_bytes = full_redux.size() * sizeof(Type); + Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes)); + Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes)); gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); - TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols); - TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr); + TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr); out_gpu.device(gpu_device) = in_gpu.sum(); - Tensor<float, 0, DataLayout> full_redux_gpu; + Tensor<Type, 0, DataLayout> full_redux_gpu; gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); gpu_device.synchronize(); @@ -54,6 +54,8 @@ static void test_full_reductions() { } void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST_1(test_full_reductions<ColMajor>()); - CALL_SUBTEST_2(test_full_reductions<RowMajor>()); + CALL_SUBTEST_1((test_full_reductions<float, ColMajor>())); + CALL_SUBTEST_1((test_full_reductions<double, ColMajor>())); + CALL_SUBTEST_2((test_full_reductions<float, 
RowMajor>())); + CALL_SUBTEST_2((test_full_reductions<double, RowMajor>())); } diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp index dbd3023d7..af59aa3ef 100644 --- a/unsupported/test/cxx11_tensor_scan.cpp +++ b/unsupported/test/cxx11_tensor_scan.cpp @@ -14,63 +14,73 @@ using Eigen::Tensor; -template <int DataLayout, typename Type=float> +template <int DataLayout, typename Type=float, bool Exclusive = false> static void test_1d_scan() { - int size = 50; - Tensor<Type, 1, DataLayout> tensor(size); - tensor.setRandom(); - Tensor<Type, 1, DataLayout> result = tensor.cumsum(0); + int size = 50; + Tensor<Type, 1, DataLayout> tensor(size); + tensor.setRandom(); + Tensor<Type, 1, DataLayout> result = tensor.cumsum(0, Exclusive); - VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0)); + VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0)); - float accum = 0; - for (int i = 0; i < size; i++) { + float accum = 0; + for (int i = 0; i < size; i++) { + if (Exclusive) { + VERIFY_IS_EQUAL(result(i), accum); + accum += tensor(i); + } else { accum += tensor(i); VERIFY_IS_EQUAL(result(i), accum); } + } - accum = 1; - result = tensor.cumprod(0); - for (int i = 0; i < size; i++) { + accum = 1; + result = tensor.cumprod(0, Exclusive); + for (int i = 0; i < size; i++) { + if (Exclusive) { + VERIFY_IS_EQUAL(result(i), accum); + accum *= tensor(i); + } else { accum *= tensor(i); VERIFY_IS_EQUAL(result(i), accum); } + } } template <int DataLayout, typename Type=float> static void test_4d_scan() { - int size = 5; - Tensor<Type, 4, DataLayout> tensor(size, size, size, size); - tensor.setRandom(); + int size = 5; + Tensor<Type, 4, DataLayout> tensor(size, size, size, size); + tensor.setRandom(); - Tensor<Type, 4, DataLayout> result(size, size, size, size); + Tensor<Type, 4, DataLayout> result(size, size, size, size); - result = tensor.cumsum(0); - float accum = 0; - for (int i = 0; i < size; i++) { - accum += tensor(i, 0, 0, 0); - VERIFY_IS_EQUAL(result(i, 0, 0, 0), accum); - } - result = tensor.cumsum(1); - accum = 0; - for (int i = 0; i < size; i++) { - accum += tensor(0, i, 0, 0); - VERIFY_IS_EQUAL(result(0, i, 0, 0), accum); - } - result = tensor.cumsum(2); - accum = 0; - for (int i = 0; i < size; i++) { - accum += tensor(0, 0, i, 0); - VERIFY_IS_EQUAL(result(0, 0, i, 0), accum); - } - result = tensor.cumsum(3); - accum = 0; - for (int i = 0; i < size; i++) { - accum += tensor(0, 0, 0, i); - VERIFY_IS_EQUAL(result(0, 0, 0, i), accum); - } + result = tensor.cumsum(0); + float accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(i, 1, 2, 3); + VERIFY_IS_EQUAL(result(i, 1, 2, 3), accum); + } + result = tensor.cumsum(1); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, i, 2, 3); + VERIFY_IS_EQUAL(result(1, i, 2, 3), accum); + } + result = tensor.cumsum(2); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, 2, i, 3); + VERIFY_IS_EQUAL(result(1, 2, i, 3), accum); + } + result = tensor.cumsum(3); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, 2, 3, i); + VERIFY_IS_EQUAL(result(1, 2, 3, i), accum); + } } template <int DataLayout> @@ -89,8 +99,10 @@ static void test_tensor_maps() { } void test_cxx11_tensor_scan() { - CALL_SUBTEST(test_1d_scan<ColMajor>()); - CALL_SUBTEST(test_1d_scan<RowMajor>()); + CALL_SUBTEST((test_1d_scan<ColMajor, float, true>())); + CALL_SUBTEST((test_1d_scan<ColMajor, float, false>())); + CALL_SUBTEST((test_1d_scan<RowMajor, float, true>())); + 
CALL_SUBTEST((test_1d_scan<RowMajor, float, false>())); CALL_SUBTEST(test_4d_scan<ColMajor>()); CALL_SUBTEST(test_4d_scan<RowMajor>()); CALL_SUBTEST(test_tensor_maps<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu new file mode 100644 index 000000000..35e19e51c --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +void test_cuda_cumsum(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); + Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size); + Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size); + + t_input.setRandom(); + + std::size_t t_input_bytes = t_input.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_input; + float* d_t_result; + + cudaMalloc((void**)(&d_t_input), t_input_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > + gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size)); + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size)); + + gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); + t_result = t_input.cumsum(1); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + for (size_t i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); + } + + cudaFree((void*)d_t_input); + cudaFree((void*)d_t_result); +} + + +void test_cxx11_tensor_scan_cuda() +{ + CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128)); + CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128)); +} diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index a03f75cfe..2f56eb495 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -33,7 +33,7 @@ static void test_comparison_sugar() { } -static void test_scalar_sugar() { +static void test_scalar_sugar_add_mul() { Tensor<float, 3> A(6, 7, 5); Tensor<float, 3> B(6, 7, 5); A.setRandom(); @@ -41,21 +41,41 @@ static void test_scalar_sugar() { const float alpha = 0.43f; const float beta = 0.21f; + const float gamma = 0.14f; - Tensor<float, 
3> R = A * A.constant(alpha) + B * B.constant(beta); - Tensor<float, 3> S = A * alpha + B * beta; - - // TODO: add enough syntactic sugar to support this - // Tensor<float, 3> T = alpha * A + beta * B; + Tensor<float, 3> R = A.constant(gamma) + A * A.constant(alpha) + B * B.constant(beta); + Tensor<float, 3> S = A * alpha + B * beta + gamma; + Tensor<float, 3> T = gamma + alpha * A + beta * B; for (int i = 0; i < 6*7*5; ++i) { VERIFY_IS_APPROX(R(i), S(i)); + VERIFY_IS_APPROX(R(i), T(i)); } } +static void test_scalar_sugar_sub_div() { + Tensor<float, 3> A(6, 7, 5); + Tensor<float, 3> B(6, 7, 5); + A.setRandom(); + B.setRandom(); + + const float alpha = 0.43f; + const float beta = 0.21f; + const float gamma = 0.14f; + const float delta = 0.32f; + + Tensor<float, 3> R = A.constant(gamma) - A / A.constant(alpha) + - B.constant(beta) / B - A.constant(delta); + Tensor<float, 3> S = gamma - A / alpha - beta / B - delta; + + for (int i = 0; i < 6*7*5; ++i) { + VERIFY_IS_APPROX(R(i), S(i)); + } +} void test_cxx11_tensor_sugar() { CALL_SUBTEST(test_comparison_sugar()); - CALL_SUBTEST(test_scalar_sugar()); + CALL_SUBTEST(test_scalar_sugar_add_mul()); + CALL_SUBTEST(test_scalar_sugar_sub_div()); } diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp index 02411a262..e770049e5 100644 --- a/unsupported/test/kronecker_product.cpp +++ b/unsupported/test/kronecker_product.cpp @@ -9,12 +9,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifdef EIGEN_TEST_PART_1 #include "sparse.h" #include <Eigen/SparseExtra> #include <Eigen/KroneckerProduct> - template<typename MatrixType> void check_dimension(const MatrixType& ab, const int rows, const int cols) { @@ -230,3 +230,23 @@ void test_kronecker_product() VERIFY_IS_APPROX(MatrixXf(sC2),dC); } } + +#endif + +#ifdef EIGEN_TEST_PART_2 + +// Simply check that, for a dense Kronecker product, the sparse module is not needed + +#include "main.h" +#include <Eigen/KroneckerProduct> + +void test_kronecker_product() +{ + MatrixXd a(2,2), b(3,3), c; + a.setRandom(); + b.setRandom(); + c = kroneckerProduct(a,b); + VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b); +} + +#endif diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp index 1aa9e786a..ffa5691eb 100644 --- a/unsupported/test/mpreal_support.cpp +++ b/unsupported/test/mpreal_support.cpp @@ -17,6 +17,7 @@ void test_mpreal_support() std::cerr << "dummy_precision = " << NumTraits<mpreal>::dummy_precision() << "\n"; std::cerr << "highest = " << NumTraits<mpreal>::highest() << "\n"; std::cerr << "lowest = " << NumTraits<mpreal>::lowest() << "\n"; + std::cerr << "digits10 = " << NumTraits<mpreal>::digits10() << "\n"; for(int i = 0; i < g_repeat; i++) { int s = Eigen::internal::random<int>(1,100); diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp new file mode 100644 index 000000000..057fb3e92 --- /dev/null +++ b/unsupported/test/special_functions.cpp @@ -0,0 +1,345 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
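Taken together, test_scalar_sugar_add_mul and test_scalar_sugar_sub_div pin down the new scalar sugar: a bare scalar may now appear on either side of +, -, * and / against a tensor expression, instead of requiring an explicit constant() tensor. A rough usage sketch (the sizes and values here are illustrative, not taken from the test):

    #include <unsupported/Eigen/CXX11/Tensor>

    void scalar_sugar_demo() {
      Eigen::Tensor<float, 3> A(6, 7, 5);
      A.setConstant(2.0f);
      // Scalars on either side are lifted into elementwise functors:
      Eigen::Tensor<float, 3> T = 1.0f + 0.5f * A - 3.0f / A;  // 1 + 1 - 1.5 == 0.5 everywhere
      // Equivalent pre-existing spelling via constant():
      Eigen::Tensor<float, 3> R = A.constant(1.0f) + A * A.constant(0.5f) - A.constant(3.0f) / A;
    }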
+ +#include "main.h" +#include "../Eigen/SpecialFunctions" + +template<typename X, typename Y> +void verify_component_wise(const X& x, const Y& y) +{ + for(Index i=0; i<x.size(); ++i) + { + if((numext::isfinite)(y(i))) + VERIFY_IS_APPROX( x(i), y(i) ); + else if((numext::isnan)(y(i))) + VERIFY((numext::isnan)(x(i))); + else + VERIFY_IS_EQUAL( x(i), y(i) ); + } +} + +template<typename ArrayType> void array_special_functions() +{ + using std::abs; + using std::sqrt; + typedef typename ArrayType::Scalar Scalar; + typedef typename NumTraits<Scalar>::Real RealScalar; + + Scalar plusinf = std::numeric_limits<Scalar>::infinity(); + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + + Index rows = internal::random<Index>(1,30); + Index cols = 1; + + // API + { + ArrayType m1 = ArrayType::Random(rows,cols); +#if EIGEN_HAS_C99_MATH + VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1)); + VERIFY_IS_APPROX(m1.digamma(), digamma(m1)); + VERIFY_IS_APPROX(m1.erf(), erf(m1)); + VERIFY_IS_APPROX(m1.erfc(), erfc(m1)); +#endif // EIGEN_HAS_C99_MATH + } + + +#if EIGEN_HAS_C99_MATH + // check special functions (comparing against numpy implementation) + if (!NumTraits<Scalar>::IsComplex) + { + + { + ArrayType m1 = ArrayType::Random(rows,cols); + ArrayType m2 = ArrayType::Random(rows,cols); + + // Test various properties of igamma & igammac. These are normalized + // gamma integrals where + // igammac(a, x) = Gamma(a, x) / Gamma(a) + // igamma(a, x) = gamma(a, x) / Gamma(a) + // where Gamma and gamma are considered the standard unnormalized + // upper and lower incomplete gamma functions, respectively. + ArrayType a = m1.abs() + 2; + ArrayType x = m2.abs() + 2; + ArrayType zero = ArrayType::Zero(rows, cols); + ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0)); + ArrayType a_m1 = a - one; + ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp(); + ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp(); + ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp(); + ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp(); + + // Gamma(a, 0) == Gamma(a) + VERIFY_IS_APPROX(Eigen::igammac(a, zero), one); + + // Gamma(a, x) + gamma(a, x) == Gamma(a) + VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp()); + + // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x) + VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp()); + + // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x) + VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp()); + } + + { + // Check exact values of igamma and igammac against a third party calculation. + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + // location i*6+j corresponds to a_s[i], x_s[j]. 
+ Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); + } + + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + } + } + } + } + } +#endif // EIGEN_HAS_C99_MATH + + // Check the zeta function against scipy.special.zeta + { + ArrayType x(7), q(7), res(7), ref(7); + x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9; + q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345; + ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); ); + } + + // digamma + { + ArrayType x(7), res(7), ref(7); + x << 1, 1.5, 4, -10.5, 10000.5, 0, -1; + ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + + CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = digamma(x); verify_component_wise(res, ref); ); + } + + +#if EIGEN_HAS_C99_MATH + { + ArrayType n(11), x(11), res(11), ref(11); + n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170; + x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64; + ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + + if(sizeof(RealScalar)>=8) { // double + // Reason for commented line: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232 + // CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res, ref); ); + } + else { + // CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res.head(8), ref.head(8)); ); + CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res.head(8), ref.head(8)); ); + } + } +#endif + +#if EIGEN_HAS_C99_MATH + { + // Inputs and ground truth generated with scipy via: + // a = np.logspace(-3, 3, 5) - 1e-3 + // b = np.logspace(-3, 3, 5) - 1e-3 + // x = np.linspace(-0.1, 1.1, 5) + // (full_a, full_b, full_x) = np.vectorize(lambda a, b, x: 
(a, b, x))(*np.ix_(a, b, x)) + // full_a = full_a.flatten().tolist() # same for full_b, full_x + // v = scipy.special.betainc(full_a, full_b, full_x).flatten().tolist() + // + // Note in Eigen, we call betainc with arguments in the order (x, a, b). + ArrayType a(125); + ArrayType b(125); + ArrayType x(125); + ArrayType v(125); + ArrayType res(125); + + a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999; + + b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, + 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, + 0.999, 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999; + + x << -0.1, 0.2, 0.5, 0.8, 1.1, 
-0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, + -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1; + + v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, + 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, + 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, + 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, + nan, nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, + 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, + 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, + 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, + 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, + 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, + nan, nan, 7.864342668429763e-23, 3.015969667594166e-10, + 0.0008598571564165444, nan, nan, 6.031987710123844e-08, + 0.5000000000000007, 0.9999999396801229, nan, nan, 0.9999999999999999, + 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, + nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, + 0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan, nan, 0.0, + 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan, + 2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan; + + CALL_SUBTEST(res = betainc(a, b, x); + verify_component_wise(res, v);); + } + + // Test various properties of betainc + { + ArrayType m1 = ArrayType::Random(32); + ArrayType m2 = ArrayType::Random(32); + ArrayType m3 = ArrayType::Random(32); + ArrayType one = ArrayType::Constant(32, Scalar(1.0)); + const Scalar eps = std::numeric_limits<Scalar>::epsilon(); + ArrayType a = (m1 * 4.0).exp(); + ArrayType b = (m2 * 4.0).exp(); + ArrayType x = m3.abs(); + + // betainc(a, 1, x) == x**a + CALL_SUBTEST( + ArrayType test = betainc(a, one, x); + ArrayType expected = x.pow(a); + verify_component_wise(test, expected);); + + // betainc(1, b, x) == 1 - (1 - x)**b + CALL_SUBTEST( + ArrayType test = betainc(one, b, x); + ArrayType expected = one - (one - x).pow(b); + verify_component_wise(test, expected);); + + // betainc(a, b, x) == 1 - betainc(b, a, 1-x) + CALL_SUBTEST( + ArrayType test = betainc(a, b, x) + betainc(b, a, one - x); + ArrayType expected = one; + verify_component_wise(test, expected);); + + // betainc(a+1, b, x) = betainc(a, b, x) - x**a * (1 - x)**b / (a * beta(a, b)) + CALL_SUBTEST( + ArrayType num = x.pow(a) * (one - x).pow(b); + ArrayType denom = a * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp(); + // Add eps to rhs and lhs so that component-wise test doesn't result in + // nans when both outputs are zeros. 
+ ArrayType expected = betainc(a, b, x) - num / denom + eps; + ArrayType test = betainc(a + one, b, x) + eps; + if (sizeof(Scalar) >= 8) { // double + verify_component_wise(test, expected); + } else { + // Reason for limited test: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232 + verify_component_wise(test.head(8), expected.head(8)); + }); + + // betainc(a, b+1, x) = betainc(a, b, x) + x**a * (1 - x)**b / (b * beta(a, b)) + CALL_SUBTEST( + // Add eps to rhs and lhs so that component-wise test doesn't result in + // nans when both outputs are zeros. + ArrayType num = x.pow(a) * (one - x).pow(b); + ArrayType denom = b * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp(); + ArrayType expected = betainc(a, b, x) + num / denom + eps; + ArrayType test = betainc(a, b + one, x) + eps; + verify_component_wise(test, expected);); + } +#endif +} + +void test_special_functions() +{ + CALL_SUBTEST_1(array_special_functions<ArrayXf>()); + CALL_SUBTEST_2(array_special_functions<ArrayXd>()); +}
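For reference, the identities the new special_functions test verifies, collected from its comments into standard notation: Gamma and gamma denote the unnormalized upper and lower incomplete gamma functions (igammac and igamma are the corresponding regularized ratios), I_x(a, b) is the regularized incomplete beta function computed by betainc(a, b, x), and B(a, b) = Gamma(a)Gamma(b)/Gamma(a+b):

    \begin{aligned}
    \Gamma(a,0) &= \Gamma(a) \\
    \Gamma(a,x) + \gamma(a,x) &= \Gamma(a) \\
    \Gamma(a,x) &= (a-1)\,\Gamma(a-1,x) + x^{a-1}e^{-x} \\
    \gamma(a,x) &= (a-1)\,\gamma(a-1,x) - x^{a-1}e^{-x} \\
    I_x(a,1) &= x^a \\
    I_x(1,b) &= 1-(1-x)^b \\
    I_x(a,b) &= 1 - I_{1-x}(b,a) \\
    I_x(a+1,b) &= I_x(a,b) - \frac{x^a(1-x)^b}{a\,B(a,b)} \\
    I_x(a,b+1) &= I_x(a,b) + \frac{x^a(1-x)^b}{b\,B(a,b)}
    \end{aligned}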