Diffstat (limited to 'unsupported')
157 files changed, 9359 insertions, 1981 deletions
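The most user-visible addition in this commit is the tensor scan API: cumsum(), cumprod(), and a generic scan() taking a reducer, declared in TensorBase.h and documented in the README.md hunk below. A minimal usage sketch, assuming only what those hunks declare; the MaxReducer functor comes from TensorFunctors.h and the exclusive flag from the TensorBase.h declarations rather than from the README text:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<int, 2> a(2, 3);
      a.setValues({{1, 2, 3}, {4, 5, 6}});

      // Inclusive prefix sum along dimension 1: {{1, 3, 6}, {4, 9, 15}}.
      Eigen::Tensor<int, 2> s = a.cumsum(1);
      // Running product along dimension 0: {{1, 2, 3}, {4, 10, 18}}.
      Eigen::Tensor<int, 2> p = a.cumprod(0);
      // Exclusive scan: each entry is the total of the elements before it,
      // starting from the reducer's identity: {{0, 1, 3}, {0, 4, 9}}.
      Eigen::Tensor<int, 2> e = a.cumsum(1, /*exclusive=*/true);
      // Generic form with an explicit reducer (running maximum along dim 1).
      Eigen::Tensor<int, 2> m = a.scan(1, Eigen::internal::MaxReducer<int>());

      std::cout << s << "\n\n" << p << "\n\n" << e << "\n\n" << m << std::endl;
      return 0;
    }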
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt index 6d0cf4f9d..631a06014 100644 --- a/unsupported/Eigen/CMakeLists.txt +++ b/unsupported/Eigen/CMakeLists.txt @@ -4,6 +4,7 @@ set(Eigen_HEADERS ArpackSupport AutoDiff BVH + EulerAngles FFT IterativeSolvers KroneckerProduct @@ -17,6 +18,7 @@ set(Eigen_HEADERS Polynomials Skyline SparseExtra + SpecialFunctions Splines ) @@ -25,5 +27,6 @@ install(FILES DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel ) -add_subdirectory(src) -add_subdirectory(CXX11)
\ No newline at end of file +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") + +add_subdirectory(CXX11) diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt index a40bc4715..385ed240c 100644 --- a/unsupported/Eigen/CXX11/CMakeLists.txt +++ b/unsupported/Eigen/CXX11/CMakeLists.txt @@ -5,4 +5,4 @@ install(FILES DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel ) -add_subdirectory(src) +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 1e97ad3c0..4976a1254 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -15,6 +15,7 @@ #include <Eigen/src/Core/util/DisableStupidWarnings.h> +#include "../SpecialFunctions" #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" @@ -60,15 +61,17 @@ typedef unsigned __int64 uint64_t; #ifdef EIGEN_USE_GPU #include <iostream> #include <cuda_runtime.h> -#if defined(__CUDACC__) -#include <curand_kernel.h> +#if __cplusplus >= 201103L +#include <atomic> +#include <unistd.h> #endif #endif - #include "src/Tensor/TensorMacros.h" #include "src/Tensor/TensorForwardDeclarations.h" #include "src/Tensor/TensorMeta.h" +#include "src/Tensor/TensorFunctors.h" +#include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" #include "src/Tensor/TensorDeviceCuda.h" @@ -77,13 +80,13 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorDimensions.h" #include "src/Tensor/TensorInitializer.h" #include "src/Tensor/TensorTraits.h" -#include "src/Tensor/TensorFunctors.h" +#include "src/Tensor/TensorRandom.h" #include "src/Tensor/TensorUInt128.h" #include "src/Tensor/TensorIntDiv.h" +#include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" -#include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" @@ -115,6 +118,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorForcedEval.h" #include "src/Tensor/TensorGenerator.h" #include "src/Tensor/TensorAssign.h" +#include "src/Tensor/TensorScan.h" #include "src/Tensor/TensorExecutor.h" #include "src/Tensor/TensorDevice.h" diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt deleted file mode 100644 index 1734262bb..000000000 --- a/unsupported/Eigen/CXX11/src/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_subdirectory(util) -add_subdirectory(ThreadPool) -add_subdirectory(Tensor) -add_subdirectory(TensorSymmetry) diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt deleted file mode 100644 index 6d4b3ea0d..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_Tensor_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index eeca2f69e..02146527b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1102,7 +1102,7 @@ Example: Reduction along two dimensions. 
As a special case, if you pass no parameter to a reduction operation the original tensor is reduced along *all* its dimensions. The result is a -one-dimension tensor with a single value. +scalar, represented as a zero-dimension tensor. Eigen::Tensor<float, 3> a(2, 3, 4); a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, @@ -1112,7 +1112,7 @@ one-dimension tensor with a single value. {19.0f, 18.0f, 17.0f, 16.0f}, {20.0f, 21.0f, 22.0f, 23.0f}}}); // Reduce along all dimensions using the sum() operator. - Eigen::Tensor<float, 1> b = a.sum(); + Eigen::Tensor<float, 0> b = a.sum(); cout << "b" << endl << b << endl << endl; => b @@ -1168,6 +1168,44 @@ Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` in TensorFunctors.h for information on how to implement a reduction operator. +## Scan Operations + +A *Scan* operation returns a tensor with the same dimensions as the original +tensor. The operation performs an inclusive scan along the specified +axis, which means it computes a running total along the axis for a given +reduction operation. +If the reduction operation corresponds to summation, then this computes the +prefix sum of the tensor along the given axis. + +Example: + + // Create a tensor of 2 dimensions + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{1, 2, 3}, {4, 5, 6}}); + // Scan it along the second dimension (1) using summation + Eigen::Tensor<int, 2> b = a.cumsum(1); + // The result is a tensor with the same size as the input + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 4 5 6 + + b + 1 3 6 + 4 9 15 + +### <Operation> cumsum(const Index& axis) + +Perform a scan by summing consecutive entries. + +### <Operation> cumprod(const Index& axis) + +Perform a scan by multiplying consecutive entries. + + ## Convolutions ### <Operation> convolve(const Kernel& kernel, const Dimensions& dims) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 759dede3f..1940a9692 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -110,7 +110,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp inline Self& base() { return *this; } inline const Self& base() const { return *this; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { @@ -150,7 +150,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp return m_storage.data()[index]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { @@ -190,7 +190,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp return m_storage.data()[index]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { @@ -257,7 +257,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp return coeff(index); } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes...
otherIndices) { @@ -336,7 +336,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp { } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) : m_storage(firstDimension, otherDimensions...) @@ -350,22 +350,22 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp { EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2) : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2)) { EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3) : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3)) { EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4) : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4)) { EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5)) { EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -418,7 +418,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp return *this; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC void resize(Index firstDimension, IndexTypes... otherDimensions) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index babafe108..d06f40cd8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -254,6 +254,14 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = 1.0 + + (m_return_dim < 0 ? 
0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>())); + return m_orig_impl.costPerCoeff(vectorized) + + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + private: EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { if (m_return_dim < 0) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 1a34f3ccc..7a45a5cf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors> } EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> + log1p() const { + return unaryExpr(internal::scalar_log1p_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> abs() const { return unaryExpr(internal::scalar_abs_op<Scalar>()); @@ -204,34 +210,74 @@ class TensorBase<Derived, ReadOnlyAccessors> } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived> pow(Scalar exponent) const { - return unaryExpr(internal::scalar_pow_op<Scalar>(exponent)); + return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived> + real() const { + return unaryExpr(internal::scalar_real_op<Scalar>()); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> + imag() const { + return unaryExpr(internal::scalar_imag_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived> operator+ (Scalar rhs) const { - return unaryExpr(internal::scalar_add_op<Scalar>(rhs)); + return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived> + operator+ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived> operator- (Scalar rhs) const { EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return unaryExpr(internal::scalar_sub_op<Scalar>(rhs)); + return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived> + operator- (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const 
TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived> operator* (Scalar rhs) const { - return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs)); + return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived> + operator* (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived> operator/ (Scalar rhs) const { - return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs)); + return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived> + operator/ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs)); } EIGEN_DEVICE_FUNC @@ -277,7 +323,6 @@ class TensorBase<Derived, ReadOnlyAccessors> return unaryExpr(internal::scalar_floor_op<Scalar>()); } - // Generic binary operation support. template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> @@ -342,66 +387,66 @@ class TensorBase<Derived, ReadOnlyAccessors> // Comparisons and tests. 
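The TensorBase.h hunks here replace the dedicated scalar functors (scalar_add_op, scalar_sub_op, scalar_multiple_op, scalar_quotient1_op) with bind1st_op/bind2nd_op wrappers around the ordinary binary functors, and add friend overloads so the scalar operand may also appear on the left of +, -, * and /; log1p(), real() and imag() are new coefficient-wise unary ops. A short sketch of the expressions this enables (the values in the comments are illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> t(2, 2);
      t.setValues({{1.f, 2.f}, {3.f, 4.f}});

      // Scalar on the right goes through bind2nd_op; scalar on the left goes
      // through the new bind1st_op friend overloads.
      Eigen::Tensor<float, 2> a = t * 2.f + 1.f;  // {{3, 5}, {7, 9}}
      Eigen::Tensor<float, 2> b = 1.f - t;        // {{0, -1}, {-2, -3}}
      Eigen::Tensor<float, 2> c = 12.f / t;       // {{12, 6}, {4, 3}}

      // New coefficient-wise unary op from the same hunk.
      Eigen::Tensor<float, 2> d = t.log1p();      // log(1 + x), element-wise
      return 0;
    }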
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived> operator<(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived> operator<=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived> operator>(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived> operator>=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived> operator==(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived> operator!=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>()); } // comparisons and tests for Scalars EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const 
TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator<(Scalar threshold) const { return operator<(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator<=(Scalar threshold) const { return operator<=(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator>(Scalar threshold) const { return operator>(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator>=(Scalar threshold) const { return operator>=(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator==(Scalar threshold) const { return operator==(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator!=(Scalar threshold) const { return operator!=(constant(threshold)); } @@ -453,6 +498,28 @@ class TensorBase<Derived, ReadOnlyAccessors> return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft); } + // Scan. 
+ typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanSumOp + cumsum(const Index& axis, bool exclusive = false) const { + return TensorScanSumOp(derived(), axis, exclusive); + } + + typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanProdOp + cumprod(const Index& axis, bool exclusive = false) const { + return TensorScanProdOp(derived(), axis, exclusive); + } + + template <typename Reducer> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanOp<Reducer, const Derived> + scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { + return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer); + } + // Reductions. template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived> @@ -676,6 +743,12 @@ class TensorBase<Derived, ReadOnlyAccessors> slice(const StartIndices& startIndices, const Sizes& sizes) const { return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); } + template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived> + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, + const Derived>(derived(), startIndices, stopIndices, strides); + } template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorChippingOp<DimId, const Derived> chip(const Index offset) const { @@ -750,8 +823,8 @@ class TensorBase<Derived, ReadOnlyAccessors> EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } }; -template<typename Derived> -class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> { +template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> +class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> { public: typedef internal::traits<Derived> DerivedTraits; typedef typename DerivedTraits::Scalar Scalar; @@ -761,7 +834,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor; template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize; - template <typename OtherDerived, int AccessLevel> friend class TensorBase; + template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setZero() { @@ -780,7 +853,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA return derived() = this->template random<RandomGenerator>(); } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setValues( const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) { @@ -851,6 +924,19 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes); } + template <typename 
StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived> + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, + const Derived>(derived(), startIndices, stopIndices, strides); + } + template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived> + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { + return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, + Derived>(derived(), startIndices, stopIndices, strides); + } + template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorChippingOp<DimId, const Derived> chip(const Index offset) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index c771496e2..5d67f69f3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { - IsAligned = false, + IsAligned = true, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = false @@ -118,7 +118,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. 
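The comment above spells out the intended workaround for the rank-0 case: a scalar cannot be broadcast directly, so it must first be reshaped into a one-element tensor of rank >= 1. A brief sketch of that workaround, using the rank-0 reduction result introduced earlier in this diff:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> m(2, 3);
      m.setConstant(1.f);

      // sum() with no argument now yields a rank-0 tensor; here total() == 6.
      Eigen::Tensor<float, 0> total = m.sum();

      // A rank-0 tensor is rejected by the static assert that follows the
      // comment above; reshape it to a 1x1 tensor first, then broadcast.
      Eigen::array<Eigen::Index, 2> one_by_one = {{1, 1}};
      Eigen::array<Eigen::Index, 2> factors = {{2, 3}};
      Eigen::Tensor<float, 2> filled = total.reshape(one_by_one).broadcast(factors);
      return 0;
    }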
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); const Broadcast& broadcast = op.broadcast(); for (int i = 0; i < NumDims; ++i) { @@ -247,7 +247,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -299,7 +299,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -354,11 +354,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> if (NumDims > 0) { for (int i = NumDims - 1; i > 0; --i) { compute_cost += TensorOpCost::DivCost<Index>(); - if (internal::index_statically_eq<Broadcast>()(i, 1)) { + if (internal::index_statically_eq<Broadcast>(i, 1)) { compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); } else { - if (!internal::index_statically_eq<InputDimensions>()(i, 1)) { + if (!internal::index_statically_eq<InputDimensions>(i, 1)) { compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::ModCost<Index>() + TensorOpCost::AddCost<Index>(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 2742dbb95..1ba7ef170 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,8 +152,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { - // We could also support the case where NumInputDims==1 if needed. 
- EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -203,7 +202,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || @@ -342,7 +341,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) || (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 839c6e3e5..59bf90d93 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -128,8 +128,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(0 <= m_axis && m_axis < NumDims); const Dimensions& lhs_dims = m_leftImpl.dimensions(); @@ -248,8 +248,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; @@ -344,8 +344,8 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT((packetSize > 
1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 6f113b903..20b29e5fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType> struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > { // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename internal::promote_storage_type<typename LhsXprType::Scalar, - typename RhsXprType::Scalar>::ret Scalar; + typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type, + typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, typename traits<RhsXprType>::StorageKind>::ret StorageKind; typedef typename promote_index_type<typename traits<LhsXprType>::Index, @@ -37,7 +38,7 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > typedef typename remove_reference<RhsNested>::type _RhsNested; // From NumDims below. - static const int NumDimensions = max_n_1<traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size; + static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value; static const int Layout = traits<LhsXprType>::Layout; enum { @@ -65,7 +66,7 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, typedef Device_ Device; // From NumDims below. 
- static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size; + static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value; }; } // end namespace internal @@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp { public: typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar; - typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, - typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType; typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested; typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind; typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index; @@ -140,11 +141,11 @@ struct TensorContractionEvaluatorBase static const int RDims = internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; static const int ContractDims = internal::array_size<Indices>::value; - static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef array<Index, ContractDims> contract_t; - typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t; - typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; typedef DSizes<Index, NumDims> Dimensions; @@ -218,11 +219,9 @@ struct TensorContractionEvaluatorBase rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; } - m_i_strides[0] = 1; - m_j_strides[0] = 1; - if(ContractDims) { - m_k_strides[0] = 1; - } + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; m_i_size = 1; m_j_size = 1; @@ -318,11 +317,6 @@ struct TensorContractionEvaluatorBase } } - // Scalar case. We represent the result as a 1d tensor of size 1. 
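The deleted lines here (together with the removal of max_n_1 in the typedefs above) change the shape of a full contraction: when every dimension is contracted away, the result is now a genuine rank-0 tensor instead of a rank-1 tensor of size 1. A small sketch of what that looks like from the caller's side, assuming nothing beyond the public contract() API:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> u(3), v(3);
      u.setValues({1.f, 2.f, 3.f});
      v.setValues({4.f, 5.f, 6.f});

      // Contract the only dimension of each operand: a dot product.
      Eigen::array<Eigen::IndexPair<int>, 1> dims = {{Eigen::IndexPair<int>(0, 0)}};
      // With this change the result is rank 0, not a 1-element rank-1 tensor.
      Eigen::Tensor<float, 0> dot = u.contract(v, dims);
      float value = dot();  // 32
      return 0;
    }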
- if (LDims + RDims == 2 * ContractDims) { - m_dimensions[0] = 1; - } - // If the layout is RowMajor, we need to reverse the m_dimensions if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -510,7 +504,7 @@ struct TensorContractionEvaluatorBase // call gebp (matrix kernel) // The parameters here are copied from Eigen's GEMM implementation - gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0); + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0); } } } @@ -607,15 +601,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT static const int ContractDims = internal::array_size<Indices>::value; typedef array<Index, ContractDims> contract_t; - typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t; - typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; - static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; // Could we use NumDimensions here? typedef DSizes<Index, NumDims> Dimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 6a3ef14ef..d65dbb40f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -461,8 +461,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, #undef writeResultShmem #undef writeRow - const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); - const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); if (threadIdx.x < max_i_write) { if (max_j_write == 8) { @@ -1240,10 +1240,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT typedef array<Index, RDims> right_dim_mapper_t; typedef array<Index, ContractDims> contract_t; - typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t; - typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; - static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef DSizes<Index, NumDims> Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index b27e1a1b4..9b2cb3ff6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -130,19 +130,19 @@ class SimpleTensorContractionMapper { } Index contract_val = left ? 
col : row; - for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { - const Index idx = contract_val / m_k_strides[i]; - linidx += idx * m_contract_strides[i]; - contract_val -= idx * m_k_strides[i]; - } - if(array_size<contract_t>::value > 0) { - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx += contract_val; - } else { - linidx += contract_val * m_contract_strides[0]; - } + for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } } return linidx; @@ -153,15 +153,15 @@ class SimpleTensorContractionMapper { const bool left = (side == Lhs); Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; - for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { - const Index idx0 = nocontract_val[0] / m_ij_strides[i]; - const Index idx1 = nocontract_val[1] / m_ij_strides[i]; - linidx[0] += idx0 * m_nocontract_strides[i]; - linidx[1] += idx1 * m_nocontract_strides[i]; - nocontract_val[0] -= idx0 * m_ij_strides[i]; - nocontract_val[1] -= idx1 * m_ij_strides[i]; - } if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } if (side == Lhs && inner_dim_contiguous) { eigen_assert(m_nocontract_strides[0] == 1); linidx[0] += nocontract_val[0]; @@ -173,22 +173,24 @@ class SimpleTensorContractionMapper { } Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; - for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { - const Index idx0 = contract_val[0] / m_k_strides[i]; - const Index idx1 = contract_val[1] / m_k_strides[i]; - linidx[0] += idx0 * m_contract_strides[i]; - linidx[1] += idx1 * m_contract_strides[i]; - contract_val[0] -= idx0 * m_k_strides[i]; - contract_val[1] -= idx1 * m_k_strides[i]; - } + if (array_size<contract_t>::value> 0) { + for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx[0] += contract_val[0]; - linidx[1] += contract_val[1]; - } else { - linidx[0] += contract_val[0] * m_contract_strides[0]; - linidx[1] += contract_val[1] * m_contract_strides[0]; + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } } return IndexPair<Index>(linidx[0], linidx[1]); } @@ -200,7 +202,7 @@ class SimpleTensorContractionMapper { return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { - return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1; + return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? 
m_contract_strides[0] : 1; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 9044454fd..ee16cde9b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -14,6 +14,8 @@ #ifdef EIGEN_USE_THREADS namespace Eigen { + +#ifdef EIGEN_USE_SIMPLE_THREAD_POOL namespace internal { template<typename LhsScalar, typename LhsMapper, typename Index> @@ -52,7 +54,7 @@ struct packRhsAndKernelArg { }; } // end namespace internal - +#endif // EIGEN_USE_SIMPLE_THREAD_POOL template<typename Indices, typename LeftArgType, typename RightArgType> struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> : @@ -92,10 +94,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT typedef array<Index, RDims> right_dim_mapper_t; typedef array<Index, ContractDims> contract_t; - typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t; - typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; - static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef DSizes<Index, NumDims> Dimensions; @@ -110,6 +112,623 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, + bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + typedef + typename internal::remove_const<typename EvalLeftArgType::Scalar>::type + LhsScalar; + typedef + typename internal::remove_const<typename EvalRightArgType::Scalar>::type + RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits<LhsScalar>::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits<RhsScalar>::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + typedef internal::gemm_pack_lhs<LhsScalar, Index, + typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, ColMajor> + LhsPacker; + typedef internal::gemm_pack_rhs< + RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> + RhsPacker; + typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, + Traits::mr, Traits::nr, false, false> + GebpKernel; + + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) 
+ // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what's we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. + Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, + internal::ShardByCol> + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, + internal::ShardByRow> + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension. + const TensorOpCost cost = + contractionCost(m, n, bm, bn, bk, shard_by_col, false); + int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( + static_cast<double>(n) * m, cost, this->m_device.numThreads()); + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + // The single-threaded algorithm should be faster in this case. + if (n == 1) + this->template evalGemv<lhs_inner_dim_contiguous, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Alignment>(buffer); + else + this->template evalGemm<lhs_inner_dim_contiguous, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Alignment>(buffer); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. + shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, + internal::ShardByCol> + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, + internal::ShardByRow> + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index nk = divup(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. 
It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. + Index nm = divup(nm0, gm); + Index nn = divup(nn0, gn); + + // Last by not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceed to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= + l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, + this->m_i_strides, this->m_left_contracting_strides, + this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, + this->m_j_strides, this->m_right_contracting_strides, + this->m_k_strides); + + Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper, + OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n, + k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, + shard_by_col, parallel_pack) + .run(); + } + + // Context coordinates a single parallel gemm operation. + template <typename LhsPacker, typename RhsPacker, typename GebpKernel, + typename LhsMapper, typename RhsMapper, typename OutputMapper> + class Context { + public: + Context(const Device& device, int num_threads, LhsMapper& lhs, + RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack) + : device_(device), + lhs_(lhs), + rhs_(rhs), + buffer_(buffer), + output_(buffer, tm), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + m_(tm), + n_(tn), + k_(tk), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0) + { + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceeding kernels. + state_switch_[x] = + x == 0 + ? 1 + : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = + parallel_pack_ ? 0 : (shard_by_col_ ? 
nm_ : nn_); + state_kernel_[x] = new std::atomic<uint8_t>*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic<uint8_t>[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. + for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store( + (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), + std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices. + size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + size_t lhs_size = + divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align; + size_t rhs_size = + divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align; + packed_mem_ = static_cast<char*>(internal::aligned_malloc( + (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1))); + char* mem = static_cast<char*>(packed_mem_); + for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) { + packed_lhs_[x].resize(nm0_); + for (Index m = 0; m < nm0_; m++) { + packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem); + mem += lhs_size; + } + packed_rhs_[x].resize(nn0_); + for (Index n = 0; n < nn0_; n++) { + packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem); + mem += rhs_size; + } + } + } + + ~Context() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + internal::aligned_free(packed_mem_); + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + // Wait for overall completion. + // TODO(dvyukov): this wait can lead to deadlock. + // If nthreads contractions are concurrently submitted from worker + // threads, this wait will block all worker threads and the system will + // deadlock. + done_.Wait(); + } + + private: + Notification done_; + const Device& device_; + LhsMapper& lhs_; + RhsMapper& rhs_; + Scalar* const buffer_; + OutputMapper output_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from ni_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + + // Parallelization strategy. + // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. So we parallelize within k slices, this + // gives us parallelism level of m x n. Before we can start any kernels + // related to k-th slice, we need to issue m lhs packing tasks and n rhs + // packing tasks. + // + // However, there is a bottleneck when we are finishing kernels for k-th + // slice (at the very end there is only 1 runnable kernel). To mitigate this + // bottleneck we allow kernels from k-th and k+1-th slices to run in + // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same + // output block, so they must not run in parallel. + // + // This gives us the following dependency graph. + // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs + // packing tasks. 
+ // Kernel (m, n, k) can start when: + // - kernel (m, n, k-1) has finished + // - lhs packing (m, k) has finished + // - rhs packing (n, k) has finished + // Lhs/rhs packing can start when: + // - all k-1 packing has finished (artificially imposed to limit amount of + // parallel packing) + // + // On top of that we limit runnable tasks to two consecutive k slices. + // This is done to limit the amount of memory we need for packed lhs/rhs + // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). + // + // state_switch_ tracks when we are ready to switch to the next k slice. + // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). + // These variables are rolling over 3 consecutive k slices: the first two we are + // actively executing + one to track completion of kernels in the second + // slice. + static const Index P = 3; + void* packed_mem_; + std::vector<LhsScalar*> packed_lhs_[P - 1]; + std::vector<RhsScalar*> packed_rhs_[P - 1]; + std::atomic<uint8_t>** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after the constructor. Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic<Index> state_packing_ready_[P]; + std::atomic<Index> state_switch_[P]; + + void pack_lhs(Index m, Index k) { + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + LhsPacker()(packed_lhs_[k % (P - 1)][m1], + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + } + } + + void pack_rhs(Index n, Index k) { + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (k == 0) { + // Zero the output memory in parallel. + // On 10000x2x10000 mm zeroing can easily take half of the time. + // Zero (bn x m) row. Safe to do here because all kernels that will + // write to this memory depend on completion of this task. + // Note: don't call device_.memset() here. device_.memset() blocks on a + // thread pool worker thread, which can lead to underutilization and + // deadlocks. + memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); + } + RhsPacker()(packed_rhs_[k % (P - 1)][n1], + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); + } else { + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consecutive tasks + // (rhs fits into L2$ while lhs only into L3$). 
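For intuition, here is a minimal standalone arithmetic sketch of the counter initialization performed in the Context constructor above; the task counts nm = 2, nn = 3 and the parallel_pack/shard_by_col flags are assumed example values, not taken from the patch.

#include <cstdio>

int main() {
  const int P = 3;                  // slices tracked at once, as in Context
  const int nm = 2, nn = 3;         // assumed kernel task counts per dimension
  const bool parallel_pack = true;  // assumed flags
  const bool shard_by_col = true;
  for (int x = 0; x < P; x++) {
    // Mirrors the state_switch_[x] and state_kernel_[x][m][n] initialization above.
    const int switch_signals =
        x == 0 ? 1
               : (parallel_pack ? nn + nm : (shard_by_col ? nn : nm)) +
                     (x == P - 1 ? nm * nn : 0);
    const int kernel_signals = (x == 0 ? 0 : 1) + (parallel_pack ? 2 : 1);
    std::printf("slice %d: switch waits for %d signals, each kernel for %d\n",
                x, switch_signals, kernel_signals);
  }
  // Prints 1/2 for slice 0, 5/3 for slice 1 and 11/3 for slice 2.
  return 0;
}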
+ const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } + signal_kernel(m, n, k + 1, false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync) { + std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) return; + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) + kernel(m, n, k); + else + device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = + (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + + nm_ * nn_; + if (k < nk_) { + // Issue lhs/rhs packing. Their completion will in turn kick off + // kernels. + if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, + parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. + void enqueue_packing(Index k, bool rhs) { + enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); + } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, mid, k, rhs); }); + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? 
gn_ : nn0_ + gn_ - gn_ * nn_; } + + Context(const Context&) = delete; + void operator=(const Context&) = delete; + }; + + // Decide whether we want to shard m x n contraction by columns or by rows. + static bool shardByCol(Index m, Index n, Index num_threads) { + // Note: we are comparing both n and m against Traits::nr; it is not + // a mistake. We are trying to figure out how both n and m will fit into + // the main sharding dimension. + + // Sharding by column is the default + // ... unless there is enough data for vectorization over rows + if (m / num_threads >= Traits::nr && + // and not enough data for vectorization over columns + (n / num_threads < Traits::nr || + // ... or barely enough data for vectorization over columns, + // but it is not evenly divisible across threads + (n / num_threads < 4 * Traits::nr && + (n % (num_threads * Traits::nr)) != 0 && + // ... and it is evenly divisible across threads for rows + ((m % (num_threads * Traits::nr)) == 0 || + // ... or it is not evenly divisible in either dimension but + // there is much more data over rows so that corner effects are + // mitigated. + (m / n >= 6))))) + return false; + // Also shard by rows if the matrix is substantially elongated along the + // other dimension. + if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; + return true; + } + + Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, + int num_threads, bool shard_by_col) const { + Index gm = 1; + Index gm1 = 1; + Index nm0 = divup(m, bm); + Index nm1 = nm0; + for (;;) { + // Find the next candidate for m grain size. It needs to result in + // a different number of blocks. E.g. if we have 10 kernels, we want to try + // 5 and 10, but not 6, 7, 8 and 9. + while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; + if (gm1 > nm0) break; + // Check the candidate. + int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nm1 = divup(nm0, gm1); + if (res == 0) continue; + // Commit new grain size. + gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = divup(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nn1 = divup(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + Index gn, Index oldgm, Index oldgn, int num_threads, + bool shard_by_col) const { + const TensorOpCost cost = + contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize( + static_cast<double>(bm) * gm * bn * gn, cost); + // If the task is too small, then we accept it regardless of anything + // else; otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. + if (taskSize > 2) return -1; + // Now we are presumably in a good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. 
+ // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index new_tasks = divup(nm0, gm) * divup(nn0, gn); + double new_parallelism = static_cast<double>(new_tasks) / + (divup<int>(new_tasks, num_threads) * num_threads); + Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); + double old_parallelism = static_cast<double>(old_tasks) / + (divup<int>(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + +#else // EIGEN_USE_SIMPLE_THREAD_POOL + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> void evalProduct(Scalar* buffer) const { if (this->m_j_size == 1) { @@ -376,7 +995,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; gebp(arg.output.getSubMapper(m_base_start, arg.n), (*arg.blockAs)[blockAId], arg.blockB, - actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0); + actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); // Notify that the kernel is done. const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; @@ -384,6 +1003,47 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT } } } +#endif // EIGEN_USE_SIMPLE_THREAD_POOL + + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, + bool shard_by_col, bool prepacked) const { + const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size, + PacketType<RhsScalar, Device>::size); + const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size; + const double kd = static_cast<double>(bk); + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 : + (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, + // so overall bandwidth is 1.0. + if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. 
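To make the grain-size reasoning in checkGrain concrete, the following standalone sketch evaluates the parallelism measure for the 12-kernels/4-threads example from the comment above; divup is redeclared locally just for the sketch.

#include <cstdio>

// Ceiling division, as used throughout the contraction code.
static long divup(long x, long y) { return (x + y - 1) / y; }

int main() {
  const long kernels = 12, threads = 4;
  for (long grain = 2; grain <= 4; grain++) {
    const long tasks = divup(kernels, grain);
    const double parallelism =
        static_cast<double>(tasks) / (divup(tasks, threads) * threads);
    std::printf("grain %ld -> %ld tasks, parallelism %.2f\n", grain, tasks,
                parallelism);
  }
  // grain 2 -> 6 tasks (0.75), grain 3 -> 4 tasks (1.00), grain 4 -> 3 tasks (0.75)
  return 0;
}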
+ if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a2f1f71f5..860a6949a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -164,14 +164,14 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT }; template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { impl.evalSubExprsIfNeeded(NULL); return true; } }; template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { return impl.evalSubExprsIfNeeded(data); } }; @@ -193,7 +193,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> enum { IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast, + PacketAccess = true, Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = false }; @@ -224,11 +224,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; - PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(m_impl); - return converter.template packet<LoadMode>(index); + const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess & + internal::type_casting_traits<SrcType, TargetType>::VectorizedCast; + return PacketConv<LoadMode, Vectorizable>::run(m_impl, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -249,7 +247,31 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: - TensorEvaluator<ArgType, Device> m_impl; + template <int LoadMode, bool ActuallyVectorize> + struct PacketConv { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + internal::scalar_cast_op<SrcType, TargetType> converter; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + }; + + template <int LoadMode> + struct PacketConv<LoadMode, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; + PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, 
PacketReturnType, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet<LoadMode>(index); + } + }; + + TensorEvaluator<ArgType, Device> m_impl; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 091007ab7..abdf742c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -254,7 +254,7 @@ struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, t template<typename Indices, typename InputXprType, typename KernelXprType> -class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> > +class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 0f6dcedaa..83c449cf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -//#if !defined(EIGEN_USE_GPU) -//#define EIGEN_USE_COST_MODEL -//#endif - namespace Eigen { /** \class TensorEvaluator @@ -32,45 +28,47 @@ class TensorOpCost { // model based on minimal reciprocal throughput numbers from Intel or // Agner Fog's tables would be better than what is there now. template <typename ArgType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { return internal::functor_traits< internal::scalar_product_op<ArgType, ArgType> >::Cost; } template <typename ArgType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost; } template <typename ArgType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { return internal::functor_traits< internal::scalar_quotient_op<ArgType, ArgType> >::Cost; } template <typename ArgType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost; } template <typename SrcType, typename TargetType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { return internal::functor_traits< internal::scalar_cast_op<SrcType, TargetType> >::Cost; } + EIGEN_DEVICE_FUNC TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles) {} + EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size) : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(vectorized ? 
compute_cycles / packet_size : compute_cycles) { - using std::isfinite; - eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded)); - eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored)); - eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles)); + eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { @@ -96,21 +94,21 @@ class TensorOpCost { } // TODO(rmlarsen): Define min in terms of total cost, not elementwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin( - const TensorOpCost& rhs) { - bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded()); - bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored()); - compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles()); - return *this; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); } // TODO(rmlarsen): Define max in terms of total cost, not elementwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax( - const TensorOpCost& rhs) { - bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); - bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored()); - compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles()); - return *this; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 1d2d162dc..4f5767bc7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -12,6 +12,8 @@ namespace Eigen { +static const int kCudaScratchSize = 1024; + // This defines an interface that GPUDevice can take to use // CUDA streams underneath. class StreamInterface { @@ -24,6 +26,15 @@ class StreamInterface { // Allocate memory on the actual device where the computation will run virtual void* allocate(size_t num_bytes) const = 0; virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. 
+ virtual unsigned int* semaphore() const = 0; }; static cudaDeviceProp* m_deviceProperties; @@ -31,7 +42,21 @@ static bool m_devicePropInitialized = false; static void initializeDeviceProp() { if (!m_devicePropInitialized) { - if (!m_devicePropInitialized) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutexes don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support C++11 we + // can't ensure that the initialization is thread safe. +#if __cplusplus >= 201103L + static std::atomic<bool> first(true); + if (first.exchange(false)) { +#else + static bool first = true; + if (first) { + first = false; +#endif + // We're the first thread to reach this point. int num_devices; cudaError_t status = cudaGetDeviceCount(&num_devices); if (status != cudaSuccess) { @@ -52,7 +77,19 @@ static void initializeDeviceProp() { assert(status == cudaSuccess); } } + +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_release); +#endif m_devicePropInitialized = true; + } else { + // Wait for the other thread to initialize the properties. + while (!m_devicePropInitialized) { +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_acquire); +#endif + sleep(1); + } } } } @@ -62,12 +99,12 @@ static const cudaStream_t default_stream = cudaStreamDefault; class CudaStreamDevice : public StreamInterface { public: // Use the default stream on the current device - CudaStreamDevice() : stream_(&default_stream) { + CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { cudaGetDevice(&device_); initializeDeviceProp(); } // Use the default stream on the specified device - CudaStreamDevice(int device) : stream_(&default_stream), device_(device) { + CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { initializeDeviceProp(); } // Use the specified stream. Note that it's the @@ -75,7 +112,7 @@ class CudaStreamDevice : public StreamInterface { // the specified device. If no device is specified the code // assumes that the stream is associated to the current gpu device. 
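The initializeDeviceProp() change above performs one-time initialization without std::mutex; the following minimal standalone sketch shows the same atomics-and-fences pattern in isolation (the setup body is a placeholder).

#include <atomic>
#include <unistd.h>

static std::atomic<bool> first(true);  // exactly one thread wins the exchange
static bool initialized = false;       // published after the release fence

void initOnce() {
  if (first.exchange(false)) {
    // ... expensive one-time setup would go here ...
    std::atomic_thread_fence(std::memory_order_release);
    initialized = true;
  } else {
    // Other threads spin until the winner publishes the flag.
    while (!initialized) {
      std::atomic_thread_fence(std::memory_order_acquire);
      sleep(1);
    }
  }
}

int main() { initOnce(); return 0; }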
CudaStreamDevice(const cudaStream_t* stream, int device = -1) - : stream_(stream), device_(device) { + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { if (device < 0) { cudaGetDevice(&device_); } else { @@ -89,6 +126,12 @@ class CudaStreamDevice : public StreamInterface { initializeDeviceProp(); } + virtual ~CudaStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + const cudaStream_t& stream() const { return *stream_; } const cudaDeviceProp& deviceProperties() const { return m_deviceProperties[device_]; @@ -112,9 +155,29 @@ class CudaStreamDevice : public StreamInterface { assert(err == cudaSuccess); } + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize; + semaphore_ = reinterpret_cast<unsigned int*>(scratch); + cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + return semaphore_; + } + private: const cudaStream_t* stream_; int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; }; struct GpuDevice { @@ -131,22 +194,20 @@ struct GpuDevice { return stream_->stream(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return NULL; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); + } -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { @@ -156,30 +217,22 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - eigen_assert(false && "The default device should be used instead to generate kernel code"); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif } 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { @@ -188,21 +241,21 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - eigen_assert(false && "The default device should be used instead to generate kernel code"); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + EIGEN_STRONG_INLINE size_t numThreads() const { // FIXME return 32; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME return 48*1024; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and // there is no l3 cache on cuda devices. return firstLevelCacheSize(); @@ -222,56 +275,26 @@ struct GpuDevice { #endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { return stream_->deviceProperties().maxThreadsPerMultiProcessor; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { return stream_->deviceProperties().sharedMemPerBlock; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const { + EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index c02891465..069680a11 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -14,7 +14,7 @@ namespace Eigen { // Use the SimpleThreadPool by default. 
We'll switch to the new non-blocking // thread pool later. -#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>; typedef NonBlockingThreadPool ThreadPool; #else @@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { // Build a thread pool device on top of an existing pool of threads. struct ThreadPoolDevice { // The ownership of the thread pool remains with the caller. - ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { } + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); @@ -130,7 +130,7 @@ struct ThreadPoolDevice { ::memset(buffer, c, n); } - EIGEN_STRONG_INLINE size_t numThreads() const { + EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; } @@ -151,9 +151,7 @@ struct ThreadPoolDevice { template <class Function, class... Args> EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { Notification* n = new Notification(); - std::function<void()> func = - std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...); - pool_->Schedule(func); + pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...)); return n; } @@ -161,20 +159,118 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const { - std::function<void()> func = std::bind( - &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...); - pool_->Schedule(func); + pool_->Schedule(std::bind( + &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...)); } template <class Function, class... Args> EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { - std::function<void()> func = std::bind(f, args...); - pool_->Schedule(func); + pool_->Schedule(std::bind(f, args...)); + } + + // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if + // called from one of the threads in pool_. Returns -1 otherwise. + EIGEN_STRONG_INLINE int currentThreadId() const { + return pool_->CurrentThreadId(); + } + + // parallelFor executes f with [0, n) arguments in parallel and waits for + // completion. F accepts a half-open interval [first, last). + // Block size is chosen based on the iteration cost and resulting parallel + // efficiency. If block_align is not nullptr, it is called to round up the + // block size. + void parallelFor(Index n, const TensorOpCost& cost, + std::function<Index(Index)> block_align, + std::function<void(Index, Index)> f) const { + typedef TensorCostModel<ThreadPoolDevice> CostModel; + if (n <= 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) { + f(0, n); + return; + } + + // Calculate block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small, to mitigate + // parallelization overheads; not too large, to mitigate the tail + // effect and potential load imbalance; and we also want the number + // of blocks to be evenly divisible across threads. 
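The parallelFor interface documented above lends itself to a short usage sketch; the thread count, array size and per-coefficient cost below are illustrative assumptions, not values from the patch.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  Eigen::ThreadPool pool(4);                 // worker threads
  Eigen::ThreadPoolDevice device(&pool, 4);  // pool ownership stays with the caller

  const Eigen::Index n = 1 << 20;
  std::vector<float> x(n, 1.0f), y(n, 0.0f);

  // Rough per-coefficient cost: one load, one store, one multiply.
  const Eigen::TensorOpCost cost(sizeof(float), sizeof(float), 1);

  device.parallelFor(n, cost, [&](Eigen::Index first, Eigen::Index last) {
    for (Eigen::Index i = first; i < last; ++i) y[i] = 2.0f * x[i];
  });
  return 0;
}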
+ + double block_size_f = 1.0 / CostModel::taskSize(1, cost); + Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f)); + const Index max_block_size = + numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f)); + if (block_align) { + Index new_block_size = block_align(block_size); + eigen_assert(new_block_size >= block_size); + block_size = numext::mini(n, new_block_size); + } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for + // computations: + double max_efficiency = + static_cast<double>(block_count) / + (divup<int>(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it + // doesn't decrease parallel efficiency. + for (Index prev_block_count = block_count; prev_block_count > 1;) { + // This is the next block size that divides size into a smaller number + // of blocks than the current block_size. + Index coarser_block_size = divup(n, prev_block_count - 1); + if (block_align) { + Index new_block_size = block_align(coarser_block_size); + eigen_assert(new_block_size >= coarser_block_size); + coarser_block_size = numext::mini(n, new_block_size); + } + if (coarser_block_size > max_block_size) { + break; // Reached max block size. Stop. + } + // Recalculate parallel efficiency. + const Index coarser_block_count = divup(n, coarser_block_size); + eigen_assert(coarser_block_count < prev_block_count); + prev_block_count = coarser_block_count; + const double coarser_efficiency = + static_cast<double>(coarser_block_count) / + (divup<int>(coarser_block_count, numThreads()) * numThreads()); + if (coarser_efficiency + 0.01 >= max_efficiency) { + // Taking it. + block_size = coarser_block_size; + block_count = coarser_block_count; + if (max_efficiency < coarser_efficiency) { + max_efficiency = coarser_efficiency; + } + } + } + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast<unsigned int>(block_count)); + std::function<void(Index, Index)> handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { + if (last - first <= block_size) { + // Single block or less, execute directly. + f(first, last); + barrier.Notify(); + return; + } + // Split into halves and submit to the pool. + Index mid = first + divup((last - first) / 2, block_size) * block_size; + pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); + pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); + }; + handleRange(0, n); + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. 
+ void parallelFor(Index n, const TensorOpCost& cost, + std::function<void(Index, Index)> f) const { + parallelFor(n, cost, nullptr, std::move(f)); } private: ThreadPoolInterface* pool_; - size_t num_threads_; + int num_threads_; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h index ca9ac79df..1a30e45fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -44,7 +44,7 @@ template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(c } -#if defined(EIGEN_HAS_CONSTEXPR) +#if EIGEN_HAS_CONSTEXPR template <typename Index, std::size_t Rank> struct index_known_statically_impl<DimensionList<Index, Rank> > { EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index f0b8ac958..b24cdebf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,14 +29,6 @@ namespace Eigen { * \sa Tensor */ -// Can't use std::pair on cuda devices -template <typename Index> struct IndexPair { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } - Index first; - Index second; -}; - // Boilerplate code namespace internal { @@ -115,7 +107,7 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> { explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) { // todo: add assertion } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) { // todo: add assertion @@ -182,7 +174,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0 return *this; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { } explicit Sizes(std::initializer_list<std::size_t>) { // todo: add assertion @@ -190,13 +182,13 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0 #else EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) { + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { } #endif @@ -290,31 +282,31 @@ struct DSizes : array<DenseIndex, NumDims> { (*this)[0] = i0; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... 
IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) } #else - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) { eigen_assert(NumDims == 2); (*this)[0] = i0; (*this)[1] = i1; } - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { eigen_assert(NumDims == 3); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; } - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { eigen_assert(NumDims == 4); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; (*this)[3] = i3; } - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { eigen_assert(NumDims == 5); (*this)[0] = i0; (*this)[1] = i1; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index c556fec0f..a08dfa7c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -56,7 +56,7 @@ struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> template<typename XprType> -class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> > +class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar; @@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { - IsAligned = true, + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ae4ce3c90..61c111cec 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -129,6 +129,10 @@ template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double loadConstant(const double* address) { return __ldg(address); } +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +Eigen::half loadConstant(const Eigen::half* address) { + return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); +} #endif } @@ -222,7 +226,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() { } typedef typename XprType::Index Index; @@ -239,13 +243,13 @@ struct TensorEvaluator<const 
TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { - return m_functor(index); + return m_wrapper(m_functor, index); } template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_functor.template packetOp<Index, PacketReturnType>(index); + return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -259,6 +263,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> private: const NullaryOp m_functor; TensorEvaluator<ArgType, Device> m_argImpl; + const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper; }; @@ -399,6 +404,101 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg TensorEvaluator<RightArgType, Device> m_rightImpl; }; +// -------------------- CwiseTernaryOp -------------------- + +template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device> +struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device> +{ + typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType; + + enum { + IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned, + PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess & + internal::functor_traits<TernaryOp>::PacketAccess, + Layout = TensorEvaluator<Arg1Type, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_arg1Impl(op.arg1Expression(), device), + m_arg2Impl(op.arg2Expression(), device), + m_arg3Impl(op.arg3Expression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind, + typename internal::traits<Arg2Type>::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind, + typename internal::traits<Arg3Type>::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index, + typename internal::traits<Arg2Type>::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index, + typename internal::traits<Arg3Type>::Index>::value), + STORAGE_INDEX_MUST_MATCH) + + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: 
use arg2 or arg3 dimensions if they are known at compile time. + return m_arg1Impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_arg1Impl.evalSubExprsIfNeeded(NULL); + m_arg2Impl.evalSubExprsIfNeeded(NULL); + m_arg3Impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_arg1Impl.cleanup(); + m_arg2Impl.cleanup(); + m_arg3Impl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index), + m_arg2Impl.template packet<LoadMode>(index), + m_arg3Impl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits<TernaryOp>::Cost; + return m_arg1Impl.costPerCoeff(vectorized) + + m_arg2Impl.costPerCoeff(vectorized) + + m_arg3Impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const TernaryOp m_functor; + TensorEvaluator<Arg1Type, Device> m_arg1Impl; + TensorEvaluator<Arg1Type, Device> m_arg2Impl; + TensorEvaluator<Arg3Type, Device> m_arg3Impl; +}; + // -------------------- SelectOp -------------------- @@ -475,7 +575,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } private: TensorEvaluator<IfArgType, Device> m_condImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 5c3d4d630..0cac7b179 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -59,13 +59,14 @@ class TensorExecutor<Expression, DefaultDevice, true> { const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; - // Manually unroll this loop since compilers don't do it. + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. 
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - evaluator.evalPacket(i); - evaluator.evalPacket(i+PacketSize); - evaluator.evalPacket(i+2*PacketSize); - evaluator.evalPacket(i+3*PacketSize); + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } } const Index VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { @@ -92,24 +93,30 @@ struct EvalRange { evaluator.evalScalar(i); } } + + static Index alignBlockSize(Index size) { + return size; + } }; template <typename Evaluator, typename Index> struct EvalRange<Evaluator, Index, true> { + static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + static void run(Evaluator* evaluator_in, const Index first, const Index last) { Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); Index i = first; - const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index last_chunk_offset = last - 4 * PacketSize; - // Manually unroll this loop since compilers don't do it. + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. for (; i <= last_chunk_offset; i += 4*PacketSize) { - evaluator.evalPacket(i); - evaluator.evalPacket(i+PacketSize); - evaluator.evalPacket(i+2*PacketSize); - evaluator.evalPacket(i+3*PacketSize); + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } } last_chunk_offset = last - PacketSize; for (; i <= last_chunk_offset; i += PacketSize) { @@ -120,6 +127,15 @@ struct EvalRange<Evaluator, Index, true> { evaluator.evalScalar(i); } } + + static Index alignBlockSize(Index size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. + return (size + PacketSize - 1) & ~(PacketSize - 1); + } }; template <typename Expression, bool Vectorizable> @@ -133,18 +149,23 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; const Index size = array_prod(evaluator.dimensions()); +#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last); + }); +#else size_t num_threads = device.numThreads(); -#ifdef EIGEN_USE_COST_MODEL if (num_threads > 1) { num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( size, evaluator.costPerCoeff(Vectorizable), num_threads); } -#endif if (num_threads == 1) { EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size); } else { + const Index PacketSize = Vectorizable ? 
unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1; const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; @@ -161,11 +182,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { } barrier.Wait(); } +#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) } evaluator.cleanup(); } }; -#endif +#endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. @@ -212,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> { template <typename Evaluator, typename Index> __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator memcopied_eval, Index size) { +EigenMetaKernel(Evaluator eval, Index size) { const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const Index step_size = blockDim.x * gridDim.x; - // Cuda memcopies the kernel arguments. That's fine for POD, but for more - // complex types such as evaluators we should really conform to the C++ - // standard and call a proper copy constructor. - Evaluator eval(memcopied_eval); - const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 8491c4ca2..5f2e329f2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -219,6 +219,86 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsX namespace internal { +template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> +struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> > +{ + // Type promotion to handle the case where the types of the args are different. 
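The type promotion mentioned in the comment above can be illustrated in isolation; the axpy_op functor below is hypothetical, and decltype with std::declval stands in for the result_of deduction used by the traits.

#include <type_traits>
#include <utility>

// Hypothetical ternary functor whose arguments have mixed scalar types.
struct axpy_op {
  double operator()(float a, double x, float y) const { return a * x + y; }
};

// Deduce the promoted Scalar the same way the ternary traits do: the result
// type of TernaryOp(Arg1::Scalar, Arg2::Scalar, Arg3::Scalar).
typedef decltype(std::declval<axpy_op>()(std::declval<float>(),
                                         std::declval<double>(),
                                         std::declval<float>())) Promoted;

static_assert(std::is_same<Promoted, double>::value,
              "mixed float/double arguments promote to double");

int main() { return 0; }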
+  typedef typename result_of<
+      TernaryOp(typename Arg1XprType::Scalar,
+                typename Arg2XprType::Scalar,
+                typename Arg3XprType::Scalar)>::type Scalar;
+  typedef traits<Arg1XprType> XprTraits;
+  typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+  typedef typename traits<Arg1XprType>::Index Index;
+  typedef typename Arg1XprType::Nested Arg1Nested;
+  typedef typename Arg2XprType::Nested Arg2Nested;
+  typedef typename Arg3XprType::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+  typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef Scalar CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+        : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC
+    const TernaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+    arg1Expression() const { return m_arg1_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+    arg2Expression() const { return m_arg2_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+    arg3Expression() const { return m_arg3_xpr; }
+
+  protected:
+    typename Arg1XprType::Nested m_arg1_xpr;
+    typename Arg2XprType::Nested m_arg2_xpr;
+    typename Arg3XprType::Nested m_arg3_xpr;
+    const TernaryOp m_functor;
+};
+
+
+namespace internal {
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
 struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
     : traits<ThenXprType>
@@ -252,7 +332,7 @@ struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename e
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-class
TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > +class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index ece2ed91b..08eb5595a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -329,7 +329,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D for (Index i = 0; i < n; ++i) { if(FFTDir == FFT_FORWARD) { - a[i] = data[i] * std::conj(pos_j_base_powered[i]); + a[i] = data[i] * numext::conj(pos_j_base_powered[i]); } else { a[i] = data[i] * pos_j_base_powered[i]; @@ -344,7 +344,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D b[i] = pos_j_base_powered[i]; } else { - b[i] = std::conj(pos_j_base_powered[i]); + b[i] = numext::conj(pos_j_base_powered[i]); } } for (Index i = n; i < m - n; ++i) { @@ -355,7 +355,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D b[i] = pos_j_base_powered[m-i]; } else { - b[i] = std::conj(pos_j_base_powered[m-i]); + b[i] = numext::conj(pos_j_base_powered[m-i]); } } @@ -379,7 +379,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D for (Index i = 0; i < n; ++i) { if(FFTDir == FFT_FORWARD) { - data[i] = a[i] * std::conj(pos_j_base_powered[i]); + data[i] = a[i] * numext::conj(pos_j_base_powered[i]); } else { data[i] = a[i] * pos_j_base_powered[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index b27ee0084..fcee5f60d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -65,7 +65,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, inline Self& base() { return *this; } inline const Self& base() const { return *this; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const { @@ -97,7 +97,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { @@ -128,7 +128,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, return m_storage.data()[0]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const { @@ -213,7 +213,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, return coeff(index); } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) { @@ -309,7 +309,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, { } -#ifdef EIGEN_HAVE_RVALUE_REFERENCES +#if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) : m_storage(other.m_storage) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 7ec757519..c23ecdbc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -55,7 +55,7 @@ struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<X template<typename XprType> -class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> > +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; @@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - const Index numValues = m_impl.dimensions().TotalSize(); + const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. if (NumTraits<CoeffReturnType>::RequireInitialization) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index a8bd8b888..490ddd8bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -16,11 +16,12 @@ template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize; template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap; template<typename PlainObjectType> class TensorRef; -template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase; +template<typename Derived, int AccessLevel> class TensorBase; template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp; template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp; template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp; +template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp; template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp; template<typename Op, typename Dims, typename XprType> class TensorReductionOp; template<typename XprType> class TensorIndexTupleOp; @@ -42,9 +43,11 @@ template<typename ReverseDimensions, typename XprType> class TensorReverseOp; template<typename PaddingDimensions, typename XprType> class TensorPaddingOp; template<typename Shuffle, typename XprType> class TensorShufflingOp; template<typename Strides, typename XprType> class TensorStridingOp; +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp; template<typename Strides, typename XprType> class TensorInflationOp; template<typename Generator, typename XprType> class 
TensorGeneratorOp; template<typename LeftXprType, typename RightXprType> class TensorAssignOp; +template<typename Op, typename XprType> class TensorScanOp; template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 33cd00391..7164e8d60 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -25,7 +25,7 @@ struct scalar_mod_op { }; template <typename Scalar> struct functor_traits<scalar_mod_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; }; +{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; }; /** \internal @@ -38,7 +38,7 @@ struct scalar_mod2_op { }; template <typename Scalar> struct functor_traits<scalar_mod2_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; }; +{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; }; template <typename Scalar> struct scalar_fmod_op { @@ -69,7 +69,7 @@ struct scalar_sigmoid_op { template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { - const Packet one = pset1<Packet>(1); + const Packet one = pset1<Packet>(T(1)); return pdiv(one, padd(one, pexp(pnegate(x)))); } }; @@ -84,14 +84,23 @@ struct functor_traits<scalar_sigmoid_op<T> > { }; +template<typename Reducer, typename Device> +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + // Standard reduction functors template <typename T> struct SumReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = packet_traits<T>::HasAdd; static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - (*accum) += t; + internal::scalar_sum_op<T> sum_op; + *accum = sum_op(*accum, t); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { @@ -119,16 +128,26 @@ template <typename T> struct SumReducer } }; +template <typename T, typename Device> +struct reducer_traits<SumReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasAdd + }; +}; + + template <typename T> struct MeanReducer { - static const bool PacketAccess = !NumTraits<T>::IsInteger; + static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger; static const bool IsStateful = true; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { - (*accum) += t; + internal::scalar_sum_op<T> sum_op; + *accum = sum_op(*accum, t); scalarCount_++; } template <typename Packet> @@ -162,9 +181,44 @@ template <typename T> struct MeanReducer DenseIndex packetCount_; }; +template <typename T, typename Device> +struct reducer_traits<MeanReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasAdd + }; +}; + + +template <typename T, bool IsMax = true, bool IsInteger = true> +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits<T>::lowest(); + } +}; +template <typename T> +struct MinMaxBottomValue<T, 
true, false> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return -Eigen::NumTraits<T>::infinity(); + } +}; +template <typename T> +struct MinMaxBottomValue<T, false, true> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits<T>::highest(); + } +}; +template <typename T> +struct MinMaxBottomValue<T, false, false> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits<T>::infinity(); + } +}; + + template <typename T> struct MaxReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = packet_traits<T>::HasMax; static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { @@ -174,9 +228,8 @@ template <typename T> struct MaxReducer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { (*accum) = pmax<Packet>(*accum, p); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return Eigen::NumTraits<T>::lowest(); + return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value(); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { @@ -195,9 +248,18 @@ template <typename T> struct MaxReducer } }; +template <typename T, typename Device> +struct reducer_traits<MaxReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasMax + }; +}; + + template <typename T> struct MinReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = packet_traits<T>::HasMin; static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { @@ -207,9 +269,8 @@ template <typename T> struct MinReducer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { (*accum) = pmin<Packet>(*accum, p); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return Eigen::NumTraits<T>::highest(); + return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value(); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { @@ -228,10 +289,18 @@ template <typename T> struct MinReducer } }; +template <typename T, typename Device> +struct reducer_traits<MinReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasMin + }; +}; + template <typename T> struct ProdReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = packet_traits<T>::HasMul; static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { @@ -263,6 +332,14 @@ template <typename T> struct ProdReducer } }; +template <typename T, typename Device> +struct reducer_traits<ProdReducer<T>, Device> { + enum { + Cost = NumTraits<T>::MulCost, + PacketAccess = PacketType<T, Device>::HasMul + }; +}; + struct AndReducer { @@ -280,6 +357,15 @@ struct AndReducer } }; +template <typename Device> +struct reducer_traits<AndReducer, Device> { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + struct OrReducer { static const bool PacketAccess = false; static const bool IsStateful = false; @@ -295,6 +381,15 @@ struct OrReducer { } }; +template <typename Device> +struct reducer_traits<OrReducer, Device> { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + // Argmin/Argmax reducers template <typename T> struct 
ArgMaxTupleReducer { @@ -312,6 +407,15 @@ template <typename T> struct ArgMaxTupleReducer } }; +template <typename T, typename Device> +struct reducer_traits<ArgMaxTupleReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = false + }; +}; + + template <typename T> struct ArgMinTupleReducer { static const bool PacketAccess = false; @@ -328,457 +432,11 @@ template <typename T> struct ArgMinTupleReducer } }; - -// Random number generation -namespace { -#ifdef __CUDA_ARCH__ -__device__ int get_random_seed() { - return clock(); -} -#else -int get_random_seed() { -#ifdef _WIN32 - SYSTEMTIME st; - GetSystemTime(&st); - return st.wSecond + 1000 * st.wMilliseconds; -#elif defined __APPLE__ - return static_cast<int>(mach_absolute_time()); -#else - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - return static_cast<int>(ts.tv_nsec); -#endif -} -#endif -} - -#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__) -// We're not compiling a cuda kernel -template <typename T> class UniformRandomGenerator { - - public: - static const bool PacketAccess = true; - - UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - if (!deterministic) { - srand(get_random_seed()); - } - } - UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - } - - template<typename Index> - T operator()(Index) const { - return random<T>(); - } - template<typename Index, typename PacketType> - PacketType packetOp(Index) const { - const int packetSize = internal::unpacket_traits<PacketType>::size; - EIGEN_ALIGN_MAX T values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = random<T>(); - } - return internal::pload<PacketType>(values); - } - - private: - bool m_deterministic; -}; - -#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 -template <> class UniformRandomGenerator<float> { - public: - static const bool PacketAccess = true; - - UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) { - if (!deterministic) { - m_generator->seed(get_random_seed()); - } - } - UniformRandomGenerator(const UniformRandomGenerator<float>& other) { - m_generator = new std::mt19937(); - m_generator->seed(other(0) * UINT_MAX); - m_deterministic = other.m_deterministic; - } - ~UniformRandomGenerator() { - delete m_generator; - } - - template<typename Index> - float operator()(Index) const { - return m_distribution(*m_generator); - } - template<typename Index, typename PacketType> - PacketType packetOp(Index i) const { - const int packetSize = internal::unpacket_traits<PacketType>::size; - EIGEN_ALIGN_MAX float values[packetSize]; - for (int k = 0; k < packetSize; ++k) { - values[k] = this->operator()(i); - } - return internal::pload<PacketType>(values); - } - - private: - UniformRandomGenerator& operator = (const UniformRandomGenerator&); - // Make sure m_deterministic comes first to match the layout of the cpu - // version of the code. 
- bool m_deterministic; - std::mt19937* m_generator; - mutable std::uniform_real_distribution<float> m_distribution; -}; - -template <> class UniformRandomGenerator<double> { - public: - static const bool PacketAccess = true; - - UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) { - if (!deterministic) { - m_generator->seed(get_random_seed()); - } - } - UniformRandomGenerator(const UniformRandomGenerator<double>& other) { - m_generator = new std::mt19937(); - m_generator->seed(other(0) * UINT_MAX); - m_deterministic = other.m_deterministic; - } - ~UniformRandomGenerator() { - delete m_generator; - } - - template<typename Index> - double operator()(Index) const { - return m_distribution(*m_generator); - } - template<typename Index, typename PacketType> - PacketType packetOp(Index i) const { - const int packetSize = internal::unpacket_traits<PacketType>::size; - EIGEN_ALIGN_MAX double values[packetSize]; - for (int k = 0; k < packetSize; ++k) { - values[k] = this->operator()(i); - } - return internal::pload<PacketType>(values); - } - - private: - UniformRandomGenerator& operator = (const UniformRandomGenerator&); - // Make sure m_deterministic comes first to match the layout of the cpu - // version of the code. - bool m_deterministic; - std::mt19937* m_generator; - mutable std::uniform_real_distribution<double> m_distribution; -}; -#endif - -#else - -// We're compiling a cuda kernel -template <typename T> class UniformRandomGenerator; - -template <> class UniformRandomGenerator<float> { - public: - static const bool PacketAccess = true; - - __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - - __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - - template<typename Index> - __device__ float operator()(Index) const { - return curand_uniform(&m_state); - } - template<typename Index, typename PacketType> - __device__ float4 packetOp(Index) const { - EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return curand_uniform4(&m_state); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class UniformRandomGenerator<double> { - public: - static const bool PacketAccess = true; - - __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 
0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ double operator()(Index) const { - return curand_uniform_double(&m_state); - } - template<typename Index, typename PacketType> - __device__ double2 packetOp(Index) const { - EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return curand_uniform2_double(&m_state); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class UniformRandomGenerator<std::complex<float> > { - public: - static const bool PacketAccess = false; - - __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ std::complex<float> operator()(Index) const { - float4 vals = curand_uniform4(&m_state); - return std::complex<float>(vals.x, vals.y); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class UniformRandomGenerator<std::complex<double> > { - public: - static const bool PacketAccess = false; - - __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ std::complex<double> operator()(Index) const { - double2 vals = curand_uniform2_double(&m_state); - return std::complex<double>(vals.x, vals.y); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -#endif - -template <typename Scalar> -struct functor_traits<UniformRandomGenerator<Scalar> > { - enum { - // Rough estimate. 
- Cost = 100 * NumTraits<Scalar>::MulCost, - PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess - }; -}; - - - -#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900) -// We're not compiling a cuda kernel -template <typename T> class NormalRandomGenerator { - public: - static const bool PacketAccess = true; - - NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) { - if (!deterministic) { - m_generator->seed(get_random_seed()); - } - } - NormalRandomGenerator(const NormalRandomGenerator& other) - : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) { - m_generator->seed(other(0) * UINT_MAX); - } - ~NormalRandomGenerator() { - delete m_generator; - } - template<typename Index> - T operator()(Index) const { - return m_distribution(*m_generator); - } - template<typename Index, typename PacketType> - PacketType packetOp(Index) const { - const int packetSize = internal::unpacket_traits<PacketType>::size; - EIGEN_ALIGN_MAX T values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = m_distribution(*m_generator); - } - return internal::pload<PacketType>(values); - } - - private: - // No assignment - NormalRandomGenerator& operator = (const NormalRandomGenerator&); - - bool m_deterministic; - mutable std::normal_distribution<T> m_distribution; - std::mt19937* m_generator; -}; - -#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) - -// We're compiling a cuda kernel -template <typename T> class NormalRandomGenerator; - -template <> class NormalRandomGenerator<float> { - public: - static const bool PacketAccess = true; - - __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ float operator()(Index) const { - return curand_normal(&m_state); - } - template<typename Index, typename PacketType> - __device__ float4 packetOp(Index) const { - EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return curand_normal4(&m_state); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class NormalRandomGenerator<double> { - public: - static const bool PacketAccess = true; - - __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 
0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ double operator()(Index) const { - return curand_normal_double(&m_state); - } - template<typename Index, typename PacketType> - __device__ double2 packetOp(Index) const { - EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return curand_normal2_double(&m_state); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class NormalRandomGenerator<std::complex<float> > { - public: - static const bool PacketAccess = false; - - __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ NormalRandomGenerator(const NormalRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ std::complex<float> operator()(Index) const { - float4 vals = curand_normal4(&m_state); - return std::complex<float>(vals.x, vals.y); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class NormalRandomGenerator<std::complex<double> > { - public: - static const bool PacketAccess = false; - - __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ NormalRandomGenerator(const NormalRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ std::complex<double> operator()(Index) const { - double2 vals = curand_normal2_double(&m_state); - return std::complex<double>(vals.x, vals.y); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -#else - -template <typename T> class NormalRandomGenerator { - public: - static const bool PacketAccess = false; - NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {} - - private: - bool m_deterministic; -}; - -#endif - -template <typename Scalar> -struct functor_traits<NormalRandomGenerator<Scalar> > { +template <typename T, typename Device> +struct reducer_traits<ArgMinTupleReducer<T>, Device> { enum { - // Rough estimate. 
-    Cost = 100 * NumTraits<Scalar>::MulCost,
-    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
   };
 };
@@ -797,7 +455,7 @@ class GaussianGenerator {
     }
   }
-  T operator()(const array<Index, NumDims>& coordinates) const {
+  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
     T tmp = T(0);
     for (size_t i = 0; i < NumDims; ++i) {
       T offset = coordinates[i] - m_means[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 8ff7d5815..eb1d4934e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -134,7 +134,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 000000000..665b861cf
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+                         const ADerived, const BDerived, const XDerived>
+    betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+  return TensorCwiseTernaryOp<
+      internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+      const BDerived, const XDerived>(
+      a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index 38a833f82..a901c5dd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -13,38 +13,61 @@ namespace Eigen {
 namespace internal {
-template<>
-struct significant_decimals_impl<std::string>
-  : significant_decimals_default_impl<std::string, true>
-{};
-}
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+      static const int layout = Tensor::Layout;
+      Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+      os << matrix;
+    }
+  }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+      os << array;
+    }
+  }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    os << tensor.coeff(0);
+  }
+};
+}
 template <typename T>
 std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+  typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+  typedef typename Evaluator::Dimensions Dimensions;
+
   // Evaluate the expression if needed
   TensorForcedEvalOp<const T> eval = expr.eval();
-  TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+  Evaluator tensor(eval, DefaultDevice());
   tensor.evalSubExprsIfNeeded(NULL);
-  typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
-  typedef typename T::Index Index;
-  typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
-  const Index total_size = internal::array_prod(tensor.dimensions());
-
-  // Print the tensor as a 1d vector or a 2d matrix.
+ // Print the result static const int rank = internal::array_size<Dimensions>::value; - if (rank == 0) { - os << tensor.coeff(0); - } else if (rank == 1) { - Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size); - os << array; - } else { - const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); - static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout; - Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim); - os << matrix; - } + internal::TensorPrinter<Evaluator, rank>::run(os, tensor); // Cleanup. tensor.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index bafcc67bd..566856ed2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -174,7 +174,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { - EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); m_paddingValue = op.padding_value(); @@ -362,7 +362,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 985594bc8..3209fecd3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -10,7 +10,8 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H #define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES) + +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES #define EIGEN_HAS_INDEX_LIST @@ -45,6 +46,24 @@ struct type2index { } }; +// This can be used with IndexPairList to get compile-time constant pairs, +// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>(). 
+template <DenseIndex f, DenseIndex s> +struct type2indexpair { + static const DenseIndex first = f; + static const DenseIndex second = s; + + constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const { + return IndexPair<DenseIndex>(f, s); + } + + EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) { + eigen_assert(val.first == f); + eigen_assert(val.second == s); + } +}; + + template<DenseIndex n> struct NumTraits<type2index<n> > { typedef DenseIndex Real; @@ -73,6 +92,16 @@ EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) { } template <typename T> +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) { + val = new_val; +} +template <DenseIndex f, DenseIndex s> +EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) { + val.set(new_val); +} + + +template <typename T> struct is_compile_time_constant { static constexpr bool value = false; }; @@ -94,7 +123,22 @@ struct is_compile_time_constant<const type2index<idx>& > { static constexpr bool value = true; }; - +template <DenseIndex f, DenseIndex s> +struct is_compile_time_constant<type2indexpair<f, s> > { + static constexpr bool value = true; +}; +template <DenseIndex f, DenseIndex s> +struct is_compile_time_constant<const type2indexpair<f, s> > { + static constexpr bool value = true; +}; +template <DenseIndex f, DenseIndex s> +struct is_compile_time_constant<type2indexpair<f, s>& > { + static constexpr bool value = true; +}; +template <DenseIndex f, DenseIndex s> +struct is_compile_time_constant<const type2indexpair<f, s>& > { + static constexpr bool value = true; +}; template<typename... T> @@ -184,31 +228,32 @@ template <typename T, typename... O> -template <DenseIndex Idx> +template <DenseIndex Idx, typename ValueT> struct tuple_coeff { template <typename... T> - EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) { - return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) { + // return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); + return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t)); } template <typename... T> - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) { + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) { if (i == Idx) { update_value(array_get<Idx>(t), value); } else { - tuple_coeff<Idx-1>::set(i, t, value); + tuple_coeff<Idx-1, ValueT>::set(i, t, value); } } template <typename... T> EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) { return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) || - tuple_coeff<Idx-1>::value_known_statically(i, t); + tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t); } template <typename... T> EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) { return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && - tuple_coeff<Idx-1>::values_up_to_known_statically(t); + tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t); } template <typename... 
T> @@ -216,19 +261,19 @@ struct tuple_coeff { return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && array_get<Idx>(t) > array_get<Idx-1>(t) && - tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t); + tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t); } }; -template <> -struct tuple_coeff<0> { +template <typename ValueT> +struct tuple_coeff<0, ValueT> { template <typename... T> - EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) { + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr - return array_get<0>(t) * (i == 0); + return array_get<0>(t)/* * (i == 0)*/; } template <typename... T> - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) { + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) { eigen_assert (i == 0); update_value(array_get<0>(t), value); } @@ -254,13 +299,13 @@ struct tuple_coeff<0> { template<typename FirstType, typename... OtherTypes> struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value); } EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { } @@ -268,14 +313,14 @@ struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> { EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { } EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this); } EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, 
DenseIndex>::values_up_to_known_statically(*this); } EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); } }; @@ -286,6 +331,23 @@ constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, Ot } +template<typename FirstType, typename... OtherTypes> +struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const { + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) { + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { } + EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { } + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this); + } +}; + namespace internal { template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { @@ -303,6 +365,13 @@ template<typename FirstType, typename... OtherTypes> struct array_size<const Ind static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value; }; +template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; +template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; + template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) { return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a); } @@ -472,6 +541,57 @@ struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > { } }; + + +template <typename Tx> +struct index_pair_first_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value); + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value); + } +}; + + + +template <typename Tx> +struct index_pair_second_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value); + } +}; + + } // end namespace internal } // end namespace Eigen @@ -482,53 +602,69 @@ namespace internal { template <typename T> struct index_known_statically_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { return false; } }; template <typename T> struct all_indices_known_statically_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { return false; } }; template <typename T> struct indices_statically_known_to_increase_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { return false; } }; template <typename T> struct index_statically_eq_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template <typename T> struct index_statically_ne_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template <typename T> struct index_statically_gt_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template <typename T> struct index_statically_lt_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template <typename Tx> +struct index_pair_first_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template <typename Tx> +struct index_pair_second_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; + + } // end namespace internal } // end namespace Eigen @@ -572,6 +708,16 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool 
index_statically_lt(DenseIndex i, return index_statically_lt_impl<T>::run(i, value); } +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { + return index_pair_first_statically_eq_impl<T>::run(i, value); +} + +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { + return index_pair_second_statically_eq_impl<T>::run(i, value); +} + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index de2f67d74..f391fb9ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -189,7 +189,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index 2d223140e..33edc49e3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H #define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES #include <initializer_list> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 33c6c1b0f..ede3939c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -29,25 +29,47 @@ namespace Eigen { namespace internal { namespace { + // Note: result is undefined if val == 0 template <typename T> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val) { #ifdef __CUDA_ARCH__ - return (sizeof(T) == 8) ? __clzll(val) : __clz(val); + return __clz(val); #elif EIGEN_COMP_MSVC - unsigned long index; - if (sizeof(T) == 8) { - _BitScanReverse64(&index, val); - } else { - _BitScanReverse(&index, val); - } - return (sizeof(T) == 8) ? 63 - index : 31 - index; + unsigned long index; + _BitScanReverse(&index, val); + return 31 - index; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clz(static_cast<uint32_t>(val)); +#endif + } + + template <typename T> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val) + { +#ifdef __CUDA_ARCH__ + return __clzll(val); +#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 + unsigned long index; + _BitScanReverse64(&index, val); + return 63 - index; +#elif EIGEN_COMP_MSVC + // MSVC's _BitScanReverse64 is not available for 32bits builds. 
+ unsigned int lo = (unsigned int)(val&0xffffffff); + unsigned int hi = (unsigned int)((val>>32)&0xffffffff); + int n; + if(hi==0) + n = 32 + count_leading_zeros<unsigned int>(lo); + else + n = count_leading_zeros<unsigned int>(hi); + return n; #else EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); - return (sizeof(T) == 8) ? - __builtin_clzll(static_cast<uint64_t>(val)) : - __builtin_clz(static_cast<uint32_t>(val)); + return __builtin_clzll(static_cast<uint64_t>(val)); #endif } @@ -98,7 +120,9 @@ namespace { return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; - TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1)); + TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) + - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + + TensorUInt128<static_val<0>, static_val<1> >(1); return static_cast<uint64_t>(result); #endif } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index 8ed71f838..ee0078bbc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -28,7 +28,7 @@ // SFINAE requires variadic templates #ifndef __CUDACC__ -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC #if EIGEN_GNUC_AT_LEAST(4,8) @@ -44,7 +44,7 @@ typename internal::enable_if< ( __condition__ ) , int >::type = 0 -#if defined(EIGEN_HAS_CONSTEXPR) +#if EIGEN_HAS_CONSTEXPR #define EIGEN_CONSTEXPR constexpr #else #define EIGEN_CONSTEXPR diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 9ebd9172b..6fb4f4a31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -57,7 +57,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. @@ -140,7 +140,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor return m_data[index]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { @@ -227,7 +227,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor return m_data[index]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index cd04716bd..fdb5ee6b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -47,22 +47,39 @@ template <> struct max_n_1<0> { // Default packet types template <typename Scalar, typename Device> -struct PacketType { +struct PacketType : internal::packet_traits<Scalar> { typedef typename internal::packet_traits<Scalar>::type type; - enum { size = internal::unpacket_traits<type>::size }; }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) template <> -struct PacketType<float, GpuDevice> { - typedef float4 type; - static const int size = 4; -}; -template <> -struct PacketType<double, GpuDevice> { - typedef double2 type; +struct PacketType<half, GpuDevice> { + typedef half2 type; static const int size = 2; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; }; #endif @@ -112,6 +129,20 @@ bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) { } +// Can't use std::pairs on cuda devices +template <typename Idx> struct IndexPair { + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) { + first = val.first; + second = val.second; + } + + Idx first; + Idx second; +}; + #ifdef EIGEN_HAS_SFINAE namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index bfa65a607..d34f1e328 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } protected: TensorEvaluator<ArgType, Device> m_impl; @@ -409,7 +409,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); Index inputIndices[] = {0, 0}; @@ -603,6 +603,286 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> }; + +namespace internal { +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> +struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind 
StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<StartIndices>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> +struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense> +{ + typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type; +}; + +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> +struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type> +{ + typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type; +}; + +} // end namespace internal + + +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> +class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > +{ + public: + typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested<TensorStridingSlicingOp>::type Nested; + typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorStridingSlicingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp( + const XprType& expr, const StartIndices& startIndices, + const StopIndices& stopIndices, const Strides& strides) + : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), + m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_startIndices; } + EIGEN_DEVICE_FUNC + const StartIndices& stopIndices() const { return m_stopIndices; } + EIGEN_DEVICE_FUNC + const StartIndices& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other) + { + typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_startIndices; + const StopIndices m_stopIndices; + const Strides m_strides; +}; + +// Eval as rvalue +template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> +{ + typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; + static const int NumDims = internal::array_size<Strides>::value; + + enum { + // Alignment can't be guaranteed at 
compile time since it depends on the + // slice offsets and sizes. + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) + { + // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero + DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped; + for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) { + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); + if(m_strides[i]>0){ + startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + }else{ + /* implies m_strides[i]<0 by assert */ + startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + } + m_startIndices[i] = startIndicesClamped[i]; + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + // check for degenerate intervals and compute output tensor shape + bool degenerate = false;; + for(int i = 0; i < NumDims; i++){ + Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; + if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ + m_dimensions[i] = 0; + degenerate = true; + }else{ + m_dimensions[i] = interval / m_strides[i] + + (interval % m_strides[i] != 0 ? 1 : 0); + eigen_assert(m_dimensions[i] >= 0); + } + } + Strides output_dims = m_dimensions; + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = m_strides[0]; + m_offsets[0] = startIndicesClamped[0]; + Index previousDimProduct = 1; + for (int i = 1; i < NumDims; ++i) { + previousDimProduct *= input_dims[i-1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = m_strides[NumDims-1]; + m_offsets[NumDims-1] = startIndicesClamped[NumDims-1]; + Index previousDimProduct = 1; + for (int i = NumDims - 2; i >= 0; --i) { + previousDimProduct *= input_dims[i+1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 
1 : m_outputStrides[i]); + } + } + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } else { + for (int i = 0; i < NumDims; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } + return inputIndex; + } + + static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { + return numext::maxi(min, numext::mini(max,value)); + } + + array<Index, NumDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + DSizes<Index, NumDims> m_startIndices; // clamped startIndices + DSizes<Index, NumDims> m_dimensions; + DSizes<Index, NumDims> m_offsets; // offset in a flattened shape + const Strides m_strides; + std::size_t m_block_total_size_max; +}; + +// Eval as lvalue +template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> + : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base; + typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; + static const int NumDims = internal::array_size<Strides>::value; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename 
XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 88b838b27..647bcf108 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { - IsAligned = false, + IsAligned = true, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = true, @@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector // of 1 element first and then pad. - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); // Compute dimensions m_dimensions = m_impl.dimensions(); @@ -150,27 +150,26 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + if (isPaddingAtIndexForDim(idx, i)) { return m_paddingValue; } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } - if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { + if (isPaddingAtIndexForDim(index, 0)) { return m_paddingValue; } inputIndex += (index - m_padding[0].first); } else { for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + if (isPaddingAtIndexForDim(idx, i)) { return m_paddingValue; } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i+1]; } - if (index < m_padding[NumDims-1].first || - index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + if (isPaddingAtIndexForDim(index, NumDims-1)) { return m_paddingValue; } inputIndex += (index - m_padding[NumDims-1].first); @@ -187,43 +186,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device return packetRowMajor(index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const - { - Index inputIndex; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - { - const Index idx = coords[0]; - if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { - return m_paddingValue; - } - inputIndex = idx - m_padding[0].first; - } - for (int i = 1; i < NumDims; ++i) { - const Index idx = coords[i]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * 
m_inputStrides[i]; - } - } else { - { - const Index idx = coords[NumDims-1]; - if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { - return m_paddingValue; - } - inputIndex = idx - m_padding[NumDims-1].first; - } - for (int i = NumDims - 2; i >= 0; --i) { - const Index idx = coords[i]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - } - } - return m_impl.coeff(inputIndex); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { TensorOpCost cost = m_impl.costPerCoeff(vectorized); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -239,6 +201,40 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( + Index index, int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) && + index < m_padding[dim_index].first) || + (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) && + index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#else + return (index < m_padding[dim_index].first) || + (index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { const double in = static_cast<double>(m_impl.dimensions()[i]); const double out = in + m_padding[i].first + m_padding[i].second; @@ -261,7 +257,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; @@ -273,15 +269,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; - if (last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. 
return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= lastPaddedLeft && last < firstPaddedRight) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -299,15 +295,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); const Index lastPaddedRight = m_outputStrides[1]; - if (last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= lastPaddedLeft && last < firstPaddedRight) { + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[0].first); return m_impl.template packet<Unaligned>(inputIndex); @@ -318,7 +314,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; @@ -331,15 +327,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; - if (last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= lastPaddedLeft && last < firstPaddedRight) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i+1]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -357,15 +353,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); const Index lastPaddedRight = m_outputStrides[NumDims-1]; - if (last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. 
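// A host-side sketch of the three-way case split used by the packet paths here: a run
// of consecutive output indices is either entirely inside a padding zone (broadcast the
// padding value), entirely inside the input (one contiguous load), or straddles a
// boundary (element-wise fallback). This is a 1-D illustration with made-up names
// (padded_read_block), not the Eigen evaluator itself.
#include <vector>
#include <algorithm>

std::vector<float> padded_read_block(const std::vector<float>& input,
                                     int pad_left, int pad_right,
                                     int first, int block_size, float pad_value) {
  const int out_size = pad_left + static_cast<int>(input.size()) + pad_right;
  const int last = first + block_size - 1;
  std::vector<float> out(block_size, pad_value);
  if (last < pad_left || first >= out_size - pad_right) {
    return out;                                        // fully inside a padding zone
  }
  if (first >= pad_left && last < out_size - pad_right) {
    std::copy(input.begin() + (first - pad_left),      // fully inside the input: one copy
              input.begin() + (first - pad_left) + block_size, out.begin());
    return out;
  }
  for (int i = 0; i < block_size; ++i) {               // mixed: element-wise fallback
    const int idx = first + i;
    if (idx >= pad_left && idx < out_size - pad_right)
      out[i] = input[idx - pad_left];
  }
  return out;
}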
return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= lastPaddedLeft && last < firstPaddedRight) { + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[NumDims-1].first); return m_impl.template packet<Unaligned>(inputIndex); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index a87e45330..886a254f6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -184,7 +184,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h new file mode 100644 index 000000000..1655a813e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -0,0 +1,276 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H +#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H + +namespace Eigen { +namespace internal { + +namespace { + +EIGEN_DEVICE_FUNC uint64_t get_random_seed() { +#ifdef __CUDA_ARCH__ + // We don't support 3d kernels since we currently only use 1 and + // 2d kernels. + assert(threadIdx.z == 0); + return clock64() + + blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); + +#elif defined _WIN32 + // Use the current time as a baseline. + SYSTEMTIME st; + GetSystemTime(&st); + int time = st.wSecond + 1000 * st.wMilliseconds; + // Mix in a random number to make sure that we get different seeds if + // we try to generate seeds faster than the clock resolution. + // We need 2 random values since the generator only generate 16 bits at + // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) + int rnd1 = ::rand(); + int rnd2 = ::rand(); + uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; + return rnd; + +#elif defined __APPLE__ + // Same approach as for win32, except that the random number generator + // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). 
+ uint64_t rnd = ::random() ^ mach_absolute_time(); + return rnd; + +#else + // Augment the current time with pseudo random number generation + // to ensure that we get different seeds if we try to generate seeds + // faster than the clock resolution. + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + uint64_t rnd = ::random() ^ ts.tv_nsec; + return rnd; +#endif +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { + // TODO: Unify with the implementation in the non blocking thread pool. + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { + seed = seed ? seed : get_random_seed(); + return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; +} + +} // namespace + + +template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeUniform(uint64_t* state) { + unsigned rnd = PCG_XSH_RS_generator(state); + return static_cast<T>(rnd); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) { + Eigen::half result; + // Generate 10 random bits for the mantissa + unsigned rnd = PCG_XSH_RS_generator(state); + result.x = static_cast<uint16_t>(rnd & 0x3ffu); + // Set the exponent + result.x |= (static_cast<uint16_t>(15) << 10); + // Return the final result + return result - Eigen::half(1.0f); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float RandomToTypeUniform<float>(uint64_t* state) { + typedef union { + uint32_t raw; + float fp; + } internal; + internal result; + // Generate 23 random bits for the mantissa mantissa + const unsigned rnd = PCG_XSH_RS_generator(state); + result.raw = rnd & 0x7fffffu; + // Set the exponent + result.raw |= (static_cast<uint32_t>(127) << 23); + // Return the final result + return result.fp - 1.0f; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double RandomToTypeUniform<double>(uint64_t* state) { + typedef union { + uint64_t raw; + double dp; + } internal; + internal result; + result.raw = 0; + // Generate 52 random bits for the mantissa + // First generate the upper 20 bits + unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; + // The generate the lower 32 bits + unsigned rnd2 = PCG_XSH_RS_generator(state); + result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2; + // Set the exponent + result.raw |= (static_cast<uint64_t>(1023) << 52); + // Return the final result + return result.dp - 1.0; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) { + return std::complex<float>(RandomToTypeUniform<float>(state), + RandomToTypeUniform<float>(state)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) { + return std::complex<double>(RandomToTypeUniform<double>(state), + RandomToTypeUniform<double>(state)); +} + +template <typename T> class UniformRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. 
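// A standalone sketch of the two building blocks defined above: the PCG-XSH-RS step
// (64-bit LCG state, xorshift plus a state-dependent shift on output) and the bit trick
// that turns 23 random mantissa bits into a uniform float in [0, 1). The constants are
// the ones used above; the function names and the use of std::memcpy are illustrative
// host-side choices, not the Eigen implementation.
#include <cstdint>
#include <cstring>
#include <cstdio>

uint32_t pcg_xsh_rs(uint64_t* state) {
  const uint64_t current = *state;
  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;  // advance the LCG
  return static_cast<uint32_t>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

float uniform_from_bits(uint32_t rnd) {
  uint32_t bits = (rnd & 0x7fffffu)                    // 23 random mantissa bits
                | (static_cast<uint32_t>(127) << 23);  // exponent 127 -> value in [1, 2)
  float f;
  std::memcpy(&f, &bits, sizeof(f));                   // type-pun without aliasing issues
  return f - 1.0f;                                     // shift down to [0, 1)
}

int main() {
  uint64_t state = 0x853c49e6748fea9bULL;              // any seed; zero is avoided above
  for (int i = 0; i < 4; ++i)
    std::printf("%f\n", uniform_from_bits(pcg_xsh_rs(&state)));
}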
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + const UniformRandomGenerator& other) { + m_state = other.m_state; + } + + template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + uint64_t local_state = m_state + i; + T result = RandomToTypeUniform<T>(&local_state); + m_state = local_state; + return result; + } + + template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_ALIGN_MAX T values[packetSize]; + uint64_t local_state = m_state + i; + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeUniform<T>(&local_state); + } + m_state = local_state; + return internal::pload<Packet>(values); + } + + private: + mutable uint64_t m_state; +}; + +template <typename Scalar> +struct functor_traits<UniformRandomGenerator<Scalar> > { + enum { + // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). + Cost = 12 * NumTraits<Scalar>::AddCost * + ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), + PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess + }; +}; + + + +template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeNormal(uint64_t* state) { + // Use the ratio of uniform method to generate numbers following a normal + // distribution. See for example Numerical Recipes chapter 7.3.9 for the + // details. + T u, v, q; + do { + u = RandomToTypeUniform<T>(state); + v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5)); + const T x = u - T(0.449871); + const T y = numext::abs(v) + T(0.386595); + q = x*x + y * (T(0.196)*y - T(0.25472)*x); + } while (q > T(0.27597) && + (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); + + return v/u; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) { + return std::complex<float>(RandomToTypeNormal<float>(state), + RandomToTypeNormal<float>(state)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) { + return std::complex<double>(RandomToTypeNormal<double>(state), + RandomToTypeNormal<double>(state)); +} + + +template <typename T> class NormalRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. 
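// A host-side sketch of the ratio-of-uniforms rejection step implemented by
// RandomToTypeNormal above (the constants follow the same Leva-style bounds cited from
// Numerical Recipes 7.3.9). A standard <random> engine stands in for the PCG state, so
// this is an analogue for illustration, not the library code.
#include <random>
#include <cmath>
#include <cstdio>

double normal_ratio_of_uniforms(std::mt19937_64& eng) {
  std::uniform_real_distribution<double> uni(0.0, 1.0);
  double u, v, q;
  do {
    u = uni(eng);
    v = 1.7156 * (uni(eng) - 0.5);
    const double x = u - 0.449871;
    const double y = std::abs(v) + 0.386595;
    q = x * x + y * (0.196 * y - 0.25472 * x);
    // accept immediately inside the inner bound, reject outside the outer bound,
    // and only evaluate the exact log test for the thin region in between
  } while (q > 0.27597 && (q > 0.27846 || v * v > -4.0 * std::log(u) * u * u));
  return v / u;
}

int main() {
  std::mt19937_64 eng(42);
  for (int i = 0; i < 4; ++i) std::printf("%f\n", normal_ratio_of_uniforms(eng));
}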
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( + const NormalRandomGenerator& other) { + m_state = other.m_state; + } + + template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + uint64_t local_state = m_state + i; + T result = RandomToTypeNormal<T>(&local_state); + m_state = local_state; + return result; + } + + template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_ALIGN_MAX T values[packetSize]; + uint64_t local_state = m_state + i; + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeNormal<T>(&local_state); + } + m_state = local_state; + return internal::pload<Packet>(values); + } + + private: + mutable uint64_t m_state; +}; + + +template <typename Scalar> +struct functor_traits<NormalRandomGenerator<Scalar> > { + enum { + // On average, we need to generate about 3 random numbers + // 15 mul, 8 add, 1.5 logs + Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + + 15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost + + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2, + PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess + }; +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 885295f0a..a87777b22 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -87,7 +87,7 @@ struct preserve_inner_most_dims { static const bool value = false; }; -#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES) +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES template <typename ReducedDims, int NumTensorDims> struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>(); @@ -122,7 +122,7 @@ struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ template <int DimIndex, typename Self, typename Op> struct GenericDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { - EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); @@ -183,7 +183,7 @@ struct InnerMostDimPreserver { template <int DimIndex, typename Self, typename Op> struct InnerMostDimPreserver<DimIndex, Self, Op, true> { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; 
InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); @@ -248,16 +248,12 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { *output = reducer.finalize(reducer.initialize()); return; } -#ifdef EIGEN_USE_COST_MODEL const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) + TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize); const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( num_coeffs, cost, device.numThreads()); -#else - const int num_threads = device.numThreads(); -#endif if (num_threads == 1) { *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); @@ -268,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); - Barrier barrier(numblocks); + Barrier barrier(internal::convert_index<unsigned int>(numblocks)); MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, @@ -320,7 +316,18 @@ struct OuterReducer { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template <int B, int N, typename S, typename R, typename I> -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + + +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename S, typename R, typename I> +__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); +template <int B, int N, typename S, typename R, typename I> +__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); +template <int NPT, typename S, typename R, typename I> +__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); + +#endif template <int NPT, typename S, typename R, typename I> __global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); @@ -396,7 +403,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { - EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -464,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - static bool size_large_enough(Index total_size) { -#ifndef EIGEN_USE_COST_MODEL - return total_size > 1024 * 1024; -#else - return true || total_size; -#endif - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. 
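// A plain-C++ sketch of the sharded full reduction that the ThreadPoolDevice
// specialization above performs: split the coefficients into equal blocks, reduce each
// block on its own thread into a per-shard accumulator, then fold the shards (plus any
// tail) on the calling thread. std::thread and join() stand in for the device's thread
// pool and Barrier; threaded_sum is an illustrative name.
#include <vector>
#include <thread>
#include <numeric>
#include <cstdio>

float threaded_sum(const std::vector<float>& data, int num_threads) {
  const size_t blocksize = data.size() / num_threads;
  std::vector<float> shards(num_threads, 0.0f);        // one partial result per block
  std::vector<std::thread> workers;
  for (int t = 0; t < num_threads; ++t) {
    workers.emplace_back([&, t] {
      const size_t begin = t * blocksize;
      shards[t] = std::accumulate(data.begin() + begin,
                                  data.begin() + begin + blocksize, 0.0f);
    });
  }
  for (std::thread& w : workers) w.join();             // plays the role of the Barrier
  float result = std::accumulate(data.begin() + num_threads * blocksize,
                                 data.end(), 0.0f);    // reduce the tail block
  for (float s : shards) result += s;                  // finalize: fold the shards
  return result;
}

int main() {
  std::vector<float> v(1000, 1.0f);
  std::printf("%f\n", threaded_sum(v, 4));             // 1000
}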
- if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && + if (RunningFullReduction && + internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) { - + !RunningOnGPU)) { bool need_assign = false; if (!data) { m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType))); @@ -493,7 +492,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> } // Attempt to use an optimized reduction. - else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -506,8 +505,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> (reducing_inner_dims || ReducingInnerMostDims)) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { + data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + else { + return true; + } + } Op reducer(m_reducer); - return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } } bool preserving_inner_dims = true; @@ -522,8 +538,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { + data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + else { + return true; + } + } Op reducer(m_reducer); - return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } } } return true; @@ -533,13 +566,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> m_impl.cleanup(); if (m_result) { m_device.deallocate(m_result); + m_result = NULL; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (RunningFullReduction && m_result) { - return *m_result; + if ((RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); } Op reducer(m_reducer); if (ReducingInnerMostDims || RunningFullReduction) { @@ -558,8 +592,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, 
Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload<PacketReturnType>(m_result + index); + } EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; if (ReducingInnerMostDims) { @@ -617,11 +655,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> template <typename S, typename O, bool V> friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); +#ifdef EIGEN_HAS_CUDA_FP16 + template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); +#endif template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif + template <typename S, typename O, typename D> friend struct internal::InnerReducer; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". 
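// A 2-D worked example of the mapping that the comment above documents (the member
// function itself follows): for a column-major matrix of shape (rows, cols) stored as a
// flat array, the first input coefficient feeding output index j is j*rows when the
// reduced dimension is dim 0, and plain j (with stride rows between summands) when the
// reduced dimension is dim 1. Purely illustrative host code.
#include <vector>
#include <cstdio>

int main() {
  const int rows = 3, cols = 4;
  std::vector<int> m(rows * cols);
  for (int i = 0; i < rows * cols; ++i) m[i] = i;      // column-major: m[r + c*rows]

  for (int j = 0; j < cols; ++j) {                     // reduce dim 0: firstInput = j*rows
    int acc = 0;
    for (int r = 0; r < rows; ++r) acc += m[j * rows + r];
    std::printf("col %d sum = %d\n", j, acc);
  }
  for (int i = 0; i < rows; ++i) {                     // reduce dim 1: firstInput = i
    int acc = 0;
    for (int c = 0; c < cols; ++c) acc += m[i + c * rows];
    std::printf("row %d sum = %d\n", i, acc);
  }
}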
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index fd2587dd5..65638b6a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -67,8 +67,41 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) #endif } -template <typename T> -__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) { +// We extend atomicExch to support extra data types +template <typename Type> +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_CUDA_FP16 +template <template <typename T> class R> +__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) { + unsigned int oldval = *reinterpret_cast<unsigned int*>(output); + unsigned int newval = oldval; + reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); + if (newval == oldval) { + return; + } + } +} +#endif + +template <> +__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { #if __CUDA_ARCH__ >= 300 atomicAdd(output, accum); #else @@ -86,17 +119,44 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe } } + template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, - typename Self::CoeffReturnType* output) { + typename Self::CoeffReturnType* output, unsigned int* semaphore) { +#if __CUDA_ARCH__ >= 300 + // Initialize the output value const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - - // Initialize the output value if it wasn't initialized by the ReductionInitKernel - if (gridDim.x == 1 && first_index == 0) { - *output = reducer.initialize(); + if (gridDim.x == 1) { + if (first_index == 0) { + *output = reducer.initialize(); + } + } + else { + if (threadIdx.x == 0) { + unsigned int block = atomicCAS(semaphore, 0u, 1u); + if (block == 0) { + // We're the first block to run, initialize the output value + atomicExchCustom(output, reducer.initialize()); + __threadfence(); + atomicExch(semaphore, 2u); + } + else { + // Wait for the first block to initialize the output value. 
+ // Use atomicCAS here to ensure that the reads aren't cached + unsigned int val; + do { + val = atomicCAS(semaphore, 2u, 2u); + } + while (val < 2u); + } + } } + __syncthreads(); + + eigen_assert(gridDim.x == 1 || *semaphore >= 2u); + typename Self::CoeffReturnType accum = reducer.initialize(); Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize); for (Index i = 0; i < max_iter; i+=BlockSize) { @@ -108,50 +168,203 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { - reducer.reduce(__shfl_down(accum, offset), &accum); + reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); } if ((threadIdx.x & (warpSize - 1)) == 0) { atomicReduce(output, accum, reducer); } + + if (gridDim.x > 1 && threadIdx.x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + + +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename Self, + typename Reducer, typename Index> +__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) { + eigen_assert(blockDim.x == 1); + eigen_assert(gridDim.x == 1); + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } } +template <typename Self, + typename Reducer, typename Index> +__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + const Index num_packets = num_coeffs / 2; + for (Index i = thread_id; i < num_packets; i += num_threads) { + ((half2*)output)[i] = reducer.template initializePacket<half2>(); + } -template <typename Self, typename Op, bool Vectorizable> -struct FullReducer<Self, Op, GpuDevice, Vectorizable> { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. 
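// A host analogue (using std::atomic) of the semaphore handshake in FullReductionKernel
// above: one worker wins the CAS 0 -> 1, initializes the shared output and publishes 2;
// every other worker spins until it sees >= 2 before accumulating. The CUDA version
// relies on atomicCAS/__threadfence for the same ordering; the float CAS loop at the end
// is the portable stand-in for atomicAdd. Names here are illustrative.
#include <atomic>
#include <thread>
#include <vector>
#include <cstdio>

std::atomic<unsigned> semaphore(0);
std::atomic<float> output(0.0f);

void worker(float partial) {
  unsigned expected = 0;
  if (semaphore.compare_exchange_strong(expected, 1u)) {
    output.store(0.0f, std::memory_order_release);     // we won: initialize the output
    semaphore.store(2u, std::memory_order_release);    // publish "initialized"
  } else {
    while (semaphore.load(std::memory_order_acquire) < 2u)
      std::this_thread::yield();                       // wait for the initializer
  }
  float cur = output.load();                           // accumulate with a CAS loop
  while (!output.compare_exchange_weak(cur, cur + partial)) {}
}

int main() {
  std::vector<std::thread> ts;
  for (int i = 1; i <= 4; ++i) ts.emplace_back(worker, float(i));
  for (std::thread& t : ts) t.join();
  std::printf("%f\n", output.load());                  // 10
}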
- static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; + if (thread_id == 0 && num_coeffs % 2 != 0) { + output[num_coeffs-1] = reducer.initialize(); + } +} - template <typename OutputType> - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) { - assert(false && "Should only be called on floats"); +template <int BlockSize, int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, + half* output, half2* scratch) { + eigen_assert(NumPerThread % 2 == 0); + + const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; + + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + if (gridDim.x == 1 && first_index == 0) { + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } + __syncthreads(); + } + + half2 accum = reducer.template initializePacket<half2>(); + const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + 2*i; + eigen_assert(index + 1 < num_coeffs); + half2 val = input.m_impl.template packet<Unaligned>(index); + reducer.reducePacket(val, &accum); + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(scratch, accum, reducer); + } + + __syncthreads(); + + if (gridDim.x == 1 && first_index == 0) { + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; + } +} + +template <typename Op> +__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) { + eigen_assert(threadIdx.x == 1); + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; +} + +#endif + +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { + assert(false && "Should only be called on doubles, floats and half floats"); + } +}; + +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType Scalar; + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - // Don't crash when we're called with an input tensor of size 0. 
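// A host sketch of the half2 strategy used by the fp16 kernels above: coefficients are
// consumed two at a time into a two-lane accumulator, an odd element count seeds one
// lane with the trailing value (and the reducer's identity elsewhere), and the two lanes
// are folded only at the very end, which is the job of ReductionCleanupKernelHalfFloat
// via __low2half/__high2half. Plain floats stand in for half precision here.
#include <vector>
#include <cstdio>

float sum_in_pairs(const std::vector<float>& data) {
  float lanes[2] = {0.0f, 0.0f};                       // stands in for a half2 accumulator
  size_t n = data.size();
  if (n % 2 != 0) {                                    // odd count: fold the last element
    lanes[0] = data[n - 1];                            // into one lane up front
    --n;
  }
  for (size_t i = 0; i < n; i += 2) {                  // two-lane (packet) accumulation
    lanes[0] += data[i];
    lanes[1] += data[i + 1];
  }
  return lanes[0] + lanes[1];                          // final low/high fold
}

int main() {
  std::vector<float> v{1, 2, 3, 4, 5};
  std::printf("%f\n", sum_in_pairs(v));                // 15
}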
- if (num_coeffs == 0) { - return; + unsigned int* semaphore = NULL; + if (num_blocks > 1) { + semaphore = device.semaphore(); } + LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); + } +}; + +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename Self, typename Op> +struct FullReductionLauncher<Self, Op, Eigen::half, false> { + static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { + assert(false && "Should not be called since there is no packet accessor"); + } +}; + +template <typename Self, typename Op> +struct FullReductionLauncher<Self, Op, Eigen::half, true> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + half2* scratch = static_cast<half2*>(device.scratchpad()); if (num_blocks > 1) { - // We initialize the outputs outside the reduction kernel when we can't be sure that there + // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there // won't be a race conditions between multiple thread blocks. - LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), - 1, 32, 0, device, reducer.initialize(), 1, output); + LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), + 1, 1, 0, device, reducer, self, num_coeffs, scratch); } - LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); + LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch); + + if (num_blocks > 1) { + LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>), + 1, 1, 0, device, reducer, output, scratch); + } + } +}; +#endif + + +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, GpuDevice, Vectorizable> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple cases + // of doubles, floats and half floats +#ifdef EIGEN_HAS_CUDA_FP16 + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); +#else + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); +#endif + + template <typename OutputType> + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. 
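FullReducer::run itself now only checks for an empty input and hands off to FullReductionLauncher, which is selected on the output type through SFINAE. The shape of that dispatch, reduced to a standalone sketch (the names here are hypothetical, not Eigen's):

    #include <cassert>
    #include <type_traits>

    // Primary template: output types without an optimized GPU path hit an assert.
    template <typename OutputType, typename Enable = void>
    struct Launcher {
      static void run(OutputType*, int) { assert(false && "no optimized reduction for this type"); }
    };

    // Picked whenever OutputType is float or double, like the enable_if guard above.
    template <typename OutputType>
    struct Launcher<OutputType,
                    typename std::enable_if<std::is_same<OutputType, float>::value ||
                                            std::is_same<OutputType, double>::value>::type> {
      static void run(OutputType* /*output*/, int /*num_coeffs*/) {
        // ... size the grid and launch the CUDA reduction kernel ...
      }
    };

The Eigen::half specializations are kept separate because they additionally require PacketAccess, so that pairs of values can be loaded as half2.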
+ if (num_coeffs == 0) { + return; + } + + FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs); } }; @@ -160,6 +373,8 @@ template <int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { +#if __CUDA_ARCH__ >= 300 + typedef typename Self::CoeffReturnType Type; eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); eigen_assert(gridDim.y == 1); @@ -179,6 +394,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { output[i] = reducer.initialize(); } + __syncthreads(); } for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { @@ -188,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu const Index col_block = i % input_col_blocks; const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; - float reduced_val = reducer.initialize(); + Type reduced_val = reducer.initialize(); for (Index j = 0; j < NumPerThread; j += unroll_times) { const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); if (last_col >= num_coeffs_to_reduce) { - for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) { - const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { + const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); reducer.reduce(val, &reduced_val); } break; @@ -217,33 +433,128 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu atomicReduce(&(output[row]), reduced_val, reducer); } } + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +#ifdef EIGEN_HAS_CUDA_FP16 + +template <int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + half* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + eigen_assert(unroll_times % 2 == 0); + + const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); + const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2); + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + Index i = 2*thread_id; + for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) { + half* loc = output + i; + *((half2*)loc) = reducer.template initializePacket<half2>(); + } + if (i < num_preserved_coeffs) { + output[i] = reducer.initialize(); + } __syncthreads(); } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = 2 * (i / input_col_blocks); + + if (row + 1 < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x); + + half2 reduced_val1 
= reducer.template initializePacket<half2>(); + half2 reduced_val2 = reducer.template initializePacket<half2>(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2; + if (last_col >= num_coeffs_to_reduce) { + Index col = col_begin + blockDim.x * j; + for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) { + const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col); + reducer.reducePacket(val1, &reduced_val1); + const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col); + reducer.reducePacket(val2, &reduced_val2); + } + if (col < num_coeffs_to_reduce) { + // Peel; + const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + const half2 val1 = __halves2half2(last1, reducer.initialize()); + reducer.reducePacket(val1, &reduced_val1); + const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col); + const half2 val2 = __halves2half2(last2, reducer.initialize()); + reducer.reducePacket(val2, &reduced_val2); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k) * 2; + reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1); + reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2); + } + } + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); + reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); + } + + half val1 = __low2half(reduced_val1); + reducer.reduce(__high2half(reduced_val1), &val1); + half val2 = __low2half(reduced_val2); + reducer.reduce(__high2half(reduced_val2), &val2); + half2 val = __halves2half2(val1, val2); + + if ((threadIdx.x & (warpSize - 1)) == 0) { + half* loc = output + row; + atomicReduce((half2*)loc, val, reducer); + } + } + } } -template <typename Self, typename Op> -struct InnerReducer<Self, Op, GpuDevice> { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. 
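The half2 partial results are published with atomicReduce on a half2*. These devices have no native fp16 atomics, so the usual implementation (and presumably the one behind atomicReduce here) is a compare-and-swap loop over the 32-bit pair; for a sum reducer it would look roughly like:

    #include <cuda_fp16.h>

    __device__ void atomic_add_half2(half2* dest, half2 val) {
      // dest must be 4-byte aligned; the pair is updated as one 32-bit word.
      unsigned int* addr = reinterpret_cast<unsigned int*>(dest);
      unsigned int old_bits = *addr, assumed_bits;
      do {
        assumed_bits = old_bits;
        half2 updated = __hadd2(*reinterpret_cast<half2*>(&assumed_bits), val);
        // The store succeeds only if no other thread modified *dest in between.
        old_bits = atomicCAS(addr, assumed_bits, *reinterpret_cast<unsigned int*>(&updated));
      } while (assumed_bits != old_bits);
    }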
- static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; +#endif - template <typename Device, typename OutputType> - static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { - assert(false && "Should only be called to reduce floats on a gpu device"); +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> +struct InnerReductionLauncher { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); return true; } +}; - static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; - // It's faster to use the usual code. - if (num_coeffs_to_reduce <= 32) { - return true; - } - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; @@ -259,7 +570,7 @@ struct InnerReducer<Self, Op, GpuDevice> { const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / 1024; const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), + LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks, 1024, 0, device, reducer.initialize(), num_preserved_vals, output); } @@ -271,6 +582,85 @@ struct InnerReducer<Self, Op, GpuDevice> { } }; +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename Self, typename Op> +struct InnerReductionLauncher<Self, Op, Eigen::half, false> { + static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { + assert(false && "Should not be called since there is no packet accessor"); + return true; + } +}; + +template <typename Self, typename Op> +struct InnerReductionLauncher<Self, Op, Eigen::half, true> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + if (num_preserved_vals % 2 != 0) { + // Not supported yet, revert to the slower code path + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = /*256*/128; + const int num_per_thread = /*128*/64; + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // 
won't be a race conditions between multiple thread blocks. + const int dyn_blocks = divup<int>(num_preserved_vals, 1024); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), + 1, 1, 0, device, reducer, self, num_preserved_vals, output); + } + + LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; +#endif + + +template <typename Self, typename Op> +struct InnerReducer<Self, Op, GpuDevice> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats and half floats. +#ifdef EIGEN_HAS_CUDA_FP16 + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); +#else + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); +#endif + + template <typename OutputType> + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return true; + } + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 128) { + return true; + } + + return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + } +}; template <int NumPerThread, typename Self, typename Reducer, typename Index> @@ -283,6 +673,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { output[i] = reducer.initialize(); } + __syncthreads(); } // Do the reduction. @@ -307,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> { // so reduce the scope of the optimized version of the code to the simple case // of floats. 
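All of these launchers size their grids the same way: divide the coefficient count by (threads per block times values per thread), then clamp against the number of blocks the multiprocessors can keep resident. As a standalone helper with the same arithmetic as Eigen's divup:

    #include <algorithm>

    inline int divup(int x, int y) { return (x + y - 1) / y; }

    // Example: 1,048,576 coefficients at 128 threads/block and 64 values/thread
    // gives divup(1048576, 128 * 64) == 128 candidate blocks before clamping.
    inline int grid_blocks(int num_coeffs, int block_size, int per_thread, int max_blocks) {
      return std::min(max_blocks, divup(num_coeffs, block_size * per_thread));
    }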
static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; - + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); template <typename Device, typename OutputType> static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { - assert(false && "Should only be called to reduce floats on a gpu device"); + assert(false && "Should only be called to reduce doubles or floats on a gpu device"); return true; } @@ -323,7 +714,7 @@ struct OuterReducer<Self, Op, GpuDevice> { return true; } - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index bc92d9e6d..99245f778 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -193,7 +193,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef return m_evaluator->coeff(index); } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 1a59cc8f7..14e392e36 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device : m_impl(op.expression(), device), m_reverse(op.reverse()) { // Reversing a scalar isn't supported yet. It would be a no-op anyway. - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); // Compute strides m_dimensions = m_impl.dimensions(); @@ -195,7 +195,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // TODO(ndjaitly): write a better packing routine that uses @@ -269,7 +269,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // This code is pilfered from TensorMorphing.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h new file mode 100644 index 000000000..8501466ce --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -0,0 +1,287 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
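The new TensorScan.h below provides cumulative (prefix) reductions along one axis of a tensor. The user-facing entry points are not part of this hunk; assuming the cumsum()/cumprod() methods on TensorBase that construct a TensorScanOp, typical usage would be:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> t(3, 4);
      t.setRandom();
      // Inclusive running sum along dimension 1.
      Eigen::Tensor<float, 2> inclusive = t.cumsum(1);
      // Exclusive variant: each entry is the sum of everything before it.
      Eigen::Tensor<float, 2> exclusive = t.cumsum(1, /*exclusive=*/true);
      return 0;
    }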
+// +// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H +#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H + +namespace Eigen { + +namespace internal { + +template <typename Op, typename XprType> +struct traits<TensorScanOp<Op, XprType> > + : public traits<XprType> { + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Op, typename XprType> +struct eval<TensorScanOp<Op, XprType>, Eigen::Dense> +{ + typedef const TensorScanOp<Op, XprType>& type; +}; + +template<typename Op, typename XprType> +struct nested<TensorScanOp<Op, XprType>, 1, + typename eval<TensorScanOp<Op, XprType> >::type> +{ + typedef TensorScanOp<Op, XprType> type; +}; +} // end namespace internal + +/** \class TensorScan + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor scan class. + */ +template <typename Op, typename XprType> +class TensorScanOp + : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> { +public: + typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested<TensorScanOp>::type Nested; + typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorScanOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp( + const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op()) + : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Index axis() const { return m_axis; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op accumulator() const { return m_accumulator; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool exclusive() const { return m_exclusive; } + +protected: + typename XprType::Nested m_expr; + const Index m_axis; + const Op m_accumulator; + const bool m_exclusive; +}; + +template <typename Self, typename Reducer, typename Device> +struct ScanLauncher; + +// Eval as rvalue +template <typename Op, typename ArgType, typename Device> +struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { + + typedef TensorScanOp<Op, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self; + + enum { + IsAligned = false, + PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<ArgType, 
Device>::Layout, + CoordAccess = false, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), + m_device(device), + m_exclusive(op.exclusive()), + m_accumulator(op.accumulator()), + m_size(m_impl.dimensions()[op.axis()]), + m_stride(1), + m_output(NULL) { + + // Accumulating a scalar isn't supported. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(op.axis() >= 0 && op.axis() < NumDims); + + // Compute stride of scan axis + const Dimensions& dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < op.axis(); ++i) { + m_stride = m_stride * dims[i]; + } + } else { + for (int i = NumDims - 1; i > op.axis(); --i) { + m_stride = m_stride * dims[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { + return m_stride; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { + return m_size; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { + return m_accumulator; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { + return m_exclusive; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const { + return m_impl; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { + return m_device; + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + ScanLauncher<Self, Op, Device> launcher; + if (data) { + launcher(*this, data); + return false; + } + + const Index total_size = internal::array_prod(dimensions()); + m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar))); + launcher(*this, m_output); + return true; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_output + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const + { + return m_output; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_output[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_output != NULL) { + m_device.deallocate(m_output); + m_output = NULL; + } + m_impl.cleanup(); + } + +protected: + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + const bool m_exclusive; + Op m_accumulator; + const Index m_size; + Index m_stride; + CoeffReturnType* m_output; +}; + +// CPU implementation of scan +// TODO(ibab) This single-threaded implementation should be parallelized, +// at least by running multiple scans at the same time. +template <typename Self, typename Reducer, typename Device> +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType *data) { + Index total_size = internal::array_prod(self.dimensions()); + + // We fix the index along the scan axis to 0 and perform a + // scan per remaining entry. The iteration is split into two nested + // loops to avoid an integer division by keeping track of each idx1 and idx2. 
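Stripped of the striding, the scan carried out by the loops below is a plain running reduction along one axis; the only subtlety is whether the running value is written before (exclusive) or after (inclusive) absorbing the current element. A 1-D distillation, assuming a sum accumulator:

    void scan1d(const float* in, float* out, int n, bool exclusive) {
      float accum = 0.0f;                 // plays the role of reducer.initialize()
      for (int i = 0; i < n; ++i) {
        if (exclusive) {
          out[i] = accum;                 // written value excludes in[i]
          accum += in[i];
        } else {
          accum += in[i];
          out[i] = accum;                 // written value includes in[i]
        }
      }
    }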
+ for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + for (Index idx2 = 0; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + } + } +}; + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +// GPU implementation of scan +// TODO(ibab) This placeholder implementation performs multiple scans in +// parallel, but it would be better to use a parallel scan algorithm and +// optimize memory access. +template <typename Self, typename Reducer> +__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { + // Compute offset as in the CPU version + Index val = threadIdx.x + blockIdx.x * blockDim.x; + Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); + + if (offset + (self.size() - 1) * self.stride() < total_size) { + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx = 0; idx < self.size(); idx++) { + Index curr = offset + idx * self.stride(); + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + __syncthreads(); + +} + +template <typename Self, typename Reducer> +struct ScanLauncher<Self, Reducer, GpuDevice> { + void operator()(const Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + Index num_blocks = (total_size / self.size() + 63) / 64; + Index block_size = 64; + LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data); + } +}; +#endif // EIGEN_USE_GPU && __CUDACC__ + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index e76533710..113c060e3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -166,7 +166,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; @@ -248,7 +248,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> template <int StoreMode> EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + 
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 0e89033c4..f8121d17b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -85,7 +85,7 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template <typename... DenseIndex> EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) { m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions)); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 52b7d216a..6c35bfdb6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -164,7 +164,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index inputIndices[] = {0, 0}; @@ -289,7 +289,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); Index inputIndices[] = {0, 0}; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 5950f38e2..3523e7c94 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -20,6 +20,7 @@ struct static_val { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { } + template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { eigen_assert(v == n); @@ -53,7 +54,7 @@ struct TensorUInt128 template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { - eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest()))); + eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest())); eigen_assert(x >= 0); } @@ -74,21 +75,21 @@ struct TensorUInt128 template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, 
LR>& rhs) { return (lhs.high == rhs.high) & (lhs.low == rhs.low); } template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { return (lhs.high != rhs.high) | (lhs.low != rhs.low); } template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { if (lhs.high != rhs.high) { return lhs.high > rhs.high; @@ -98,7 +99,7 @@ static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<H template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { if (lhs.high != rhs.high) { return lhs.high < rhs.high; @@ -108,7 +109,7 @@ static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low); if (result.low < rhs.low) { @@ -119,7 +120,7 @@ static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL> template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low); if (result.low > lhs.low) { @@ -130,8 +131,8 @@ static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL> template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { // Split each 128-bit integer into 4 32-bit integers, and then do the // multiplications by hand as follow: @@ -205,8 +206,8 @@ static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL> } template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) { return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low); diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index e735fc76f..0ca2cac84 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -187,7 +187,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { - EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); m_paddingValue = op.padding_value(); @@ -408,7 +408,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt deleted file mode 100644 index 6e871a8da..000000000 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_TensorSymmetry_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel - ) - -add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt deleted file mode 100644 index dc9fc78ec..000000000 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_TensorSymmetry_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt deleted file mode 100644 index 88fef50c6..000000000 --- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_ThreadPool_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h index 6dd64f185..71d55552d 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -50,7 +50,7 @@ class EventCount { public: class Waiter; - EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) { + EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) { eigen_assert(waiters.size() < (1 << kWaiterBits) - 1); // Initialize epoch to something close to overflow to test overflow. state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2); @@ -169,7 +169,8 @@ class EventCount { class Waiter { friend class EventCount; - std::atomic<Waiter*> next; + // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector. 
+ EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next; std::mutex mu; std::condition_variable cv; uint64_t epoch; @@ -179,8 +180,6 @@ class EventCount { kWaiting, kSignaled, }; - // Prevent false sharing with other Waiter objects in the same vector. - char pad_[128]; }; private: @@ -200,7 +199,7 @@ class EventCount { static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; static const uint64_t kEpochInc = 1ull << kEpochShift; std::atomic<uint64_t> state_; - std::vector<Waiter>& waiters_; + MaxSizeVector<Waiter>& waiters_; void Park(Waiter* w) { std::unique_lock<std::mutex> lock(w->mu); diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 1c471a19f..354bce52a 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -23,18 +23,44 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { : env_(env), threads_(num_threads), queues_(num_threads), + coprimes_(num_threads), waiters_(num_threads), - blocked_(), - spinning_(), - done_(), + blocked_(0), + spinning_(0), + done_(false), ec_(waiters_) { - for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue()); - for (int i = 0; i < num_threads; i++) + waiters_.resize(num_threads); + + // Calculate coprimes of num_threads. + // Coprimes are used for a random walk over all threads in Steal + // and NonEmptyQueueIndex. Iteration is based on the fact that if we take + // a walk starting thread index t and calculate num_threads - 1 subsequent + // indices as (t + coprime) % num_threads, we will cover all threads without + // repetitions (effectively getting a presudo-random permutation of thread + // indices). + for (int i = 1; i <= num_threads; i++) { + unsigned a = i; + unsigned b = num_threads; + // If GCD(a, b) == 1, then a and b are coprimes. + while (b != 0) { + unsigned tmp = a; + a = b; + b = tmp % b; + } + if (a == 1) { + coprimes_.push_back(i); + } + } + for (int i = 0; i < num_threads; i++) { + queues_.push_back(new Queue()); + } + for (int i = 0; i < num_threads; i++) { threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); + } } ~NonBlockingThreadPoolTempl() { - done_.store(true, std::memory_order_relaxed); + done_ = true; // Now if all threads block without work, they will start exiting. // But note that threads can continue to work arbitrary long, // block, submit new work, unblock and otherwise live full life. @@ -50,7 +76,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { PerThread* pt = GetPerThread(); if (pt->pool == this) { // Worker thread of this pool, push onto the thread's queue. - Queue* q = queues_[pt->index]; + Queue* q = queues_[pt->thread_id]; t = q->PushFront(std::move(t)); } else { // A free-standing thread (or worker of another pool), push onto a random @@ -71,108 +97,111 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { env_.ExecuteTask(t); // Push failed, execute directly. 
} + int NumThreads() const final { + return static_cast<int>(threads_.size()); + } + + int CurrentThreadId() const final { + const PerThread* pt = + const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + private: typedef typename Environment::EnvThread Thread; struct PerThread { - bool inited; + constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { } NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads. - unsigned index; // Worker thread index in pool. - unsigned rand; // Random generator state. + uint64_t rand; // Random generator state. + int thread_id; // Worker thread index in pool. }; Environment env_; MaxSizeVector<Thread*> threads_; MaxSizeVector<Queue*> queues_; - std::vector<EventCount::Waiter> waiters_; + MaxSizeVector<unsigned> coprimes_; + MaxSizeVector<EventCount::Waiter> waiters_; std::atomic<unsigned> blocked_; std::atomic<bool> spinning_; std::atomic<bool> done_; EventCount ec_; // Main worker thread loop. - void WorkerLoop(unsigned index) { + void WorkerLoop(int thread_id) { PerThread* pt = GetPerThread(); pt->pool = this; - pt->index = index; - Queue* q = queues_[index]; - EventCount::Waiter* waiter = &waiters_[index]; - std::vector<Task> stolen; + pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id()); + pt->thread_id = thread_id; + Queue* q = queues_[thread_id]; + EventCount::Waiter* waiter = &waiters_[thread_id]; for (;;) { - Task t; - if (!stolen.empty()) { - t = std::move(stolen.back()); - stolen.pop_back(); - } - if (!t.f) t = q->PopFront(); + Task t = q->PopFront(); if (!t.f) { - if (Steal(&stolen)) { - t = std::move(stolen.back()); - stolen.pop_back(); - while (stolen.size()) { - Task t1 = q->PushFront(std::move(stolen.back())); - stolen.pop_back(); - if (t1.f) { - // There is not much we can do in this case. Just execute the - // remaining directly. - stolen.push_back(std::move(t1)); - break; + t = Steal(); + if (!t.f) { + // Leave one thread spinning. This reduces latency. + // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it. + // Also, the time it takes to attempt to steal work 1000 times depends + // on the size of the thread pool. However the speed at which the user + // of the thread pool submit tasks is independent of the size of the + // pool. Consider a time based limit instead. + if (!spinning_ && !spinning_.exchange(true)) { + for (int i = 0; i < 1000 && !t.f; i++) { + t = Steal(); + } + spinning_ = false; + } + if (!t.f) { + if (!WaitForWork(waiter, &t)) { + return; } } } } if (t.f) { env_.ExecuteTask(t); - continue; } - // Leave one thread spinning. This reduces latency. - if (!spinning_ && !spinning_.exchange(true)) { - bool nowork = true; - for (int i = 0; i < 1000; i++) { - if (!OutOfWork()) { - nowork = false; - break; - } - } - spinning_ = false; - if (!nowork) continue; - } - if (!WaitForWork(waiter)) return; } } // Steal tries to steal work from other worker threads in best-effort manner. 
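The rewritten Steal() below no longer harvests half of a victim's queue; it pops a single task and walks the queues in a pseudo-random order built by repeatedly adding an increment that is coprime to the pool size, which guarantees every queue is visited exactly once per sweep. That coverage property is easy to check in isolation:

    #include <cassert>
    #include <vector>

    int main() {
      const unsigned size = 6;
      const unsigned inc = 5;            // gcd(5, 6) == 1, so 5 would be one of the coprimes_
      std::vector<bool> seen(size, false);
      unsigned victim = 2;               // arbitrary starting queue
      for (unsigned i = 0; i < size; ++i) {
        assert(!seen[victim]);           // no queue is visited twice within one sweep
        seen[victim] = true;
        victim = (victim + inc) % size;
      }
      return 0;
    }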
- bool Steal(std::vector<Task>* stolen) { - if (queues_.size() == 1) return false; + Task Steal() { PerThread* pt = GetPerThread(); - unsigned lastq = pt->index; - for (unsigned i = queues_.size(); i > 0; i--) { - unsigned victim = Rand(&pt->rand) % queues_.size(); - if (victim == lastq && queues_.size() > 2) { - i++; - continue; + const size_t size = queues_.size(); + unsigned r = Rand(&pt->rand); + unsigned inc = coprimes_[r % coprimes_.size()]; + unsigned victim = r % size; + for (unsigned i = 0; i < size; i++) { + Task t = queues_[victim]->PopBack(); + if (t.f) { + return t; + } + victim += inc; + if (victim >= size) { + victim -= size; } - // Steal half of elements from a victim queue. - // It is typical to steal just one element, but that assumes that work is - // recursively subdivided in halves so that the stolen element is exactly - // half of work. If work elements are equally-sized, then is makes sense - // to steal half of elements at once and then work locally for a while. - if (queues_[victim]->PopBackHalf(stolen)) return true; - lastq = victim; } - // Just to make sure that we did not miss anything. - for (unsigned i = queues_.size(); i > 0; i--) - if (queues_[i - 1]->PopBackHalf(stolen)) return true; - return false; + return Task(); } - // WaitForWork blocks until new work is available, or if it is time to exit. - bool WaitForWork(EventCount::Waiter* waiter) { - // We already did best-effort emptiness check in Steal, so prepare blocking. + // WaitForWork blocks until new work is available (returns true), or if it is + // time to exit (returns false). Can optionally return a task to execute in t + // (in such case t.f != nullptr on return). + bool WaitForWork(EventCount::Waiter* waiter, Task* t) { + eigen_assert(!t->f); + // We already did best-effort emptiness check in Steal, so prepare for + // blocking. ec_.Prewait(waiter); - // Now do reliable emptiness check. - if (!OutOfWork()) { + // Now do a reliable emptiness check. + int victim = NonEmptyQueueIndex(); + if (victim != -1) { ec_.CancelWait(waiter); + *t = queues_[victim]->PopBack(); return true; } // Number of blocked threads is used as termination condition. @@ -186,7 +215,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { // right after incrementing blocked_ above. Now a free-standing thread // submits work and calls destructor (which sets done_). If we don't // re-check queues, we will exit leaving the work unexecuted. - if (!OutOfWork()) { + if (NonEmptyQueueIndex() != -1) { // Note: we must not pop from queues before we decrement blocked_, // otherwise the following scenario is possible. Consider that instead // of checking for emptiness we popped the only element from queues. 
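The next hunk also replaces the thread pool's Rand(), previously a 32-bit linear congruential generator, with the 64-bit PCG-XSH-RS scheme, a stronger generator at essentially the same cost per call. The same update and output functions as a standalone sketch:

    #include <cstdint>

    inline unsigned pcg_xsh_rs(uint64_t* state) {
      const uint64_t current = *state;
      // Advance the 64-bit LCG state (multiplier and increment as in the diff).
      *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
      // Output: xorshift the high bits, then a state-dependent right shift.
      return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
    }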
@@ -205,23 +234,36 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { return true; } - bool OutOfWork() { - for (unsigned i = 0; i < queues_.size(); i++) - if (!queues_[i]->Empty()) return false; - return true; + int NonEmptyQueueIndex() { + PerThread* pt = GetPerThread(); + const size_t size = queues_.size(); + unsigned r = Rand(&pt->rand); + unsigned inc = coprimes_[r % coprimes_.size()]; + unsigned victim = r % size; + for (unsigned i = 0; i < size; i++) { + if (!queues_[victim]->Empty()) { + return victim; + } + victim += inc; + if (victim >= size) { + victim -= size; + } + } + return -1; } - PerThread* GetPerThread() { + static EIGEN_STRONG_INLINE PerThread* GetPerThread() { EIGEN_THREAD_LOCAL PerThread per_thread_; PerThread* pt = &per_thread_; - if (pt->inited) return pt; - pt->inited = true; - pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id()); return pt; } - static unsigned Rand(unsigned* state) { - return *state = *state * 1103515245 + 12345; + static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) { + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); } }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h index 0544a6e15..05ed76cbe 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -38,7 +38,7 @@ namespace Eigen { template <typename Work, unsigned kSize> class RunQueue { public: - RunQueue() : front_(), back_() { + RunQueue() : front_(0), back_(0) { // require power-of-two for fast masking eigen_assert((kSize & (kSize - 1)) == 0); eigen_assert(kSize > 2); // why would you do this? @@ -100,7 +100,7 @@ class RunQueue { // PopBack removes and returns the last elements in the queue. // Can fail spuriously. Work PopBack() { - if (Empty()) return 0; + if (Empty()) return Work(); std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock); if (!lock) return Work(); unsigned back = back_.load(std::memory_order_relaxed); diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h index 17fd1658b..e75d0f467 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h @@ -24,7 +24,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment()) : env_(env), threads_(num_threads), waiters_(num_threads) { for (int i = 0; i < num_threads; i++) { - threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); + threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); })); } } @@ -55,7 +55,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { // Schedule fn() for execution in the pool of threads. The functions are // executed in the order in which they are scheduled. 
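Related to this, RunQueue::PopBack above now returns a default-constructed Work instead of 0 when the queue is empty: a task is a struct wrapping a std::function, so "no work" is signalled by an empty callable that callers test through t.f. In miniature (a simplified stand-in for Eigen's Task, not the real type):

    #include <functional>

    struct Task {
      std::function<void()> f;   // empty f means "no work was available"
    };

    Task pop_or_empty(bool queue_empty) {
      if (queue_empty) return Task();                  // default-constructed sentinel
      return Task{[] { /* execute the queued work item */ }};
    }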
- void Schedule(std::function<void()> fn) { + void Schedule(std::function<void()> fn) final { Task t = env_.CreateTask(std::move(fn)); std::unique_lock<std::mutex> l(mu_); if (waiters_.empty()) { @@ -69,9 +69,25 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { } } + int NumThreads() const final { + return static_cast<int>(threads_.size()); + } + + int CurrentThreadId() const final { + const PerThread* pt = this->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + protected: - void WorkerLoop() { + void WorkerLoop(int thread_id) { std::unique_lock<std::mutex> l(mu_); + PerThread* pt = GetPerThread(); + pt->pool = this; + pt->thread_id = thread_id; Waiter w; Task t; while (!exiting_) { @@ -111,13 +127,24 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { bool ready; }; + struct PerThread { + constexpr PerThread() : pool(NULL), thread_id(-1) { } + SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads. + int thread_id; // Worker thread index in pool. + }; + Environment env_; std::mutex mu_; MaxSizeVector<Thread*> threads_; // All threads MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads. - std::deque<Task> pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() + std::deque<Task> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() bool exiting_ = false; + + PerThread* GetPerThread() const { + EIGEN_THREAD_LOCAL PerThread per_thread; + return &per_thread; + } }; typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h index d2204ad5b..399f95cc1 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h @@ -21,14 +21,14 @@ struct StlThreadEnvironment { // destructor must join the thread. class EnvThread { public: - EnvThread(std::function<void()> f) : thr_(f) {} + EnvThread(std::function<void()> f) : thr_(std::move(f)) {} ~EnvThread() { thr_.join(); } private: std::thread thr_; }; - EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); } + EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); } Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } void ExecuteTask(const Task& t) { t.f(); } }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h index 38b40aceb..a65ee97c9 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h @@ -18,6 +18,13 @@ class ThreadPoolInterface { public: virtual void Schedule(std::function<void()> fn) = 0; + // Returns the number of threads in the pool. + virtual int NumThreads() const = 0; + + // Returns a logical thread index between 0 and NumThreads() - 1 if called + // from one of the threads in the pool. Returns -1 otherwise. 
+ virtual int CurrentThreadId() const = 0; + virtual ~ThreadPoolInterface() {} }; diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt deleted file mode 100644 index 7eab492d6..000000000 --- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 24159e54c..30d3ebcff 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -117,7 +117,7 @@ template <typename T, size_t n> class array { values[7] = v8; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(std::initializer_list<T> l) { eigen_assert(l.size() == n); @@ -167,7 +167,7 @@ template <typename T> class array<T, 0> { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() : dummy() { } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() { eigen_assert(l.size() == 0); } diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h index 961456f10..4bc3dd1ba 100644 --- a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h +++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h @@ -55,6 +55,17 @@ class MaxSizeVector { internal::aligned_free(data_); } + void resize(size_t n) { + eigen_assert(n <= reserve_); + for (size_t i = size_; i < n; ++i) { + new (&data_[i]) T; + } + for (size_t i = n; i < size_; ++i) { + data_[i].~T(); + } + size_ = n; + } + // Append new elements (up to reserved size). EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void push_back(const T& t) { diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles new file mode 100644 index 000000000..521fa3f76 --- /dev/null +++ b/unsupported/Eigen/EulerAngles @@ -0,0 +1,43 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EULERANGLES_MODULE_H +#define EIGEN_EULERANGLES_MODULE_H + + +#include "Eigen/Core" +#include "Eigen/Geometry" + +#include "Eigen/src/Core/util/DisableStupidWarnings.h" + +namespace Eigen { + +/** + * \defgroup EulerAngles_Module EulerAngles module + * \brief This module provides generic euler angles rotation. + * + * Euler angles are a way to represent 3D rotation. + * + * In order to use this module in your code, include this header: + * \code + * #include <unsupported/Eigen/EulerAngles> + * \endcode + * + * See \ref EulerAngles for more information. 
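The module header only documents itself; the EulerAngles class template, its system tags, and the convenience typedefs live in the two headers included at the end of this file. Under that assumption (type names such as EulerAnglesZYXd follow the module's naming convention but are not shown in this diff), usage looks roughly like:

    #include <unsupported/Eigen/EulerAngles>

    int main() {
      // Rotation described as: rotate about Z, then Y, then X.
      Eigen::EulerAnglesZYXd angles(0.5, 0.1, -0.2);
      Eigen::Matrix3d R = angles.toRotationMatrix();
      // Recover an equivalent angle triple from a rotation matrix.
      Eigen::EulerAnglesZYXd back(R);
      return 0;
    }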
+ * + */ + +} + +#include "src/EulerAngles/EulerSystem.h" +#include "src/EulerAngles/EulerAngles.h" + +#include "Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_EULERANGLES_MODULE_H diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct index c932c06a6..5f5afb8cf 100644 --- a/unsupported/Eigen/KroneckerProduct +++ b/unsupported/Eigen/KroneckerProduct @@ -13,6 +13,8 @@ #include "../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include "../../Eigen/src/SparseCore/SparseUtil.h" + namespace Eigen { /** diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport index 89036886b..7f0b70c63 100644 --- a/unsupported/Eigen/MPRealSupport +++ b/unsupported/Eigen/MPRealSupport @@ -67,27 +67,32 @@ int main() IsSigned = 1, IsComplex = 0, RequireInitialization = 1, - ReadCost = 10, - AddCost = 10, - MulCost = 40 + ReadCost = HugeCost, + AddCost = HugeCost, + MulCost = HugeCost }; typedef mpfr::mpreal Real; typedef mpfr::mpreal NonInteger; - inline static Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); } - inline static Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); } + static inline Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); } + static inline Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); } // Constants - inline static Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); } - inline static Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); } - inline static Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); } - inline static Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); } + static inline Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); } + static inline Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); } + static inline Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); } + static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); } - inline static Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); } - inline static Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); } + static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); } + static inline Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); } - inline static Real dummy_precision() +#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS + static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec()) { return std::numeric_limits<Real>::digits10(Precision); } + static inline int digits10 (const Real& x) { return std::numeric_limits<Real>::digits10(x); } +#endif + + static inline Real dummy_precision() { mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100; return mpfr::machine_epsilon(weak_prec); diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions new file mode 100644 index 000000000..7c7493c56 --- /dev/null +++ b/unsupported/Eigen/SpecialFunctions @@ -0,0 +1,61 @@ +// 
This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIALFUNCTIONS_MODULE +#define EIGEN_SPECIALFUNCTIONS_MODULE + +#include "../../Eigen/Core" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +namespace Eigen { + +/** + * \defgroup SpecialFunctions_Module Special math functions module + * + * This module features additional coefficient-wise math functions available + * within the numext:: namespace for the scalar version, and as method and/or free + * functions of Array. Those include: + * + * - erf + * - erfc + * - lgamma + * - igamma + * - igammac + * - digamma + * - polygamma + * - zeta + * - betainc + * + * \code + * #include <unsupported/Eigen/SpecialFunctions> + * \endcode + */ +//@{ + +} + +#include "src/SpecialFunctions/SpecialFunctionsImpl.h" +#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h" +#include "src/SpecialFunctions/SpecialFunctionsHalf.h" +#include "src/SpecialFunctions/SpecialFunctionsFunctors.h" +#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h" + +#if defined EIGEN_VECTORIZE_CUDA + #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h" +#endif + +namespace Eigen { +//@} +} + + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_SPECIALFUNCTIONS_MODULE diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h index 1a61e3367..33b6c393f 100644 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h @@ -20,37 +20,60 @@ public: AutoDiffJacobian(const Functor& f) : Functor(f) {} // forward constructors +#if EIGEN_HAS_VARIADIC_TEMPLATES + template<typename... T> + AutoDiffJacobian(const T& ...Values) : Functor(Values...) 
{} +#else template<typename T0> AutoDiffJacobian(const T0& a0) : Functor(a0) {} template<typename T0, typename T1> AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {} template<typename T0, typename T1, typename T2> AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {} +#endif + + typedef typename Functor::InputType InputType; + typedef typename Functor::ValueType ValueType; + typedef typename ValueType::Scalar Scalar; enum { - InputsAtCompileTime = Functor::InputsAtCompileTime, - ValuesAtCompileTime = Functor::ValuesAtCompileTime + InputsAtCompileTime = InputType::RowsAtCompileTime, + ValuesAtCompileTime = ValueType::RowsAtCompileTime }; - typedef typename Functor::InputType InputType; - typedef typename Functor::ValueType ValueType; - typedef typename Functor::JacobianType JacobianType; - typedef typename JacobianType::Scalar Scalar; + typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType; typedef typename JacobianType::Index Index; - typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType; + typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType; typedef AutoDiffScalar<DerivativeType> ActiveScalar; - typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput; typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue; +#if EIGEN_HAS_VARIADIC_TEMPLATES + // Some compilers don't accept variadic parameters after a default parameter, + // i.e., we can't just write _jac=0 but we need to overload operator(): + EIGEN_STRONG_INLINE + void operator() (const InputType& x, ValueType* v) const + { + this->operator()(x, v, 0); + } + template<typename... ParamsType> + void operator() (const InputType& x, ValueType* v, JacobianType* _jac, + const ParamsType&... Params) const +#else void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const +#endif { eigen_assert(v!=0); + if (!_jac) { +#if EIGEN_HAS_VARIADIC_TEMPLATES + Functor::operator()(x, v, Params...); +#else Functor::operator()(x, v); +#endif return; } @@ -61,12 +84,16 @@ public: if(InputsAtCompileTime==Dynamic) for (Index j=0; j<jac.rows(); j++) - av[j].derivatives().resize(this->inputs()); + av[j].derivatives().resize(x.rows()); for (Index i=0; i<jac.cols(); i++) - ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i); + ax[i].derivatives() = DerivativeType::Unit(x.rows(),i); +#if EIGEN_HAS_VARIADIC_TEMPLATES + Functor::operator()(ax, &av, Params...); +#else Functor::operator()(ax, &av); +#endif for (Index i=0; i<jac.rows(); i++) { @@ -74,8 +101,6 @@ public: jac.row(i) = av[i].derivatives(); } } -protected: - }; } diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 481dfa91a..50fedf6ac 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -30,6 +30,13 @@ template<typename _DerType, bool Enable> struct auto_diff_special_op; } // end namespace internal +template<typename _DerType> class AutoDiffScalar; + +template<typename NewDerType> +inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) { + return AutoDiffScalar<NewDerType>(value,der); +} + /** \class AutoDiffScalar * \brief A scalar type replacement with automatic differentation capability * @@ -60,7 +67,7 @@ template<typename _DerType> class AutoDiffScalar : public internal::auto_diff_special_op <_DerType, !internal::is_same<typename internal::traits<typename 
internal::remove_all<_DerType>::type>::Scalar, - typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> + typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> { public: typedef internal::auto_diff_special_op @@ -101,7 +108,7 @@ class AutoDiffScalar template<typename OtherDerType> AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other #ifndef EIGEN_PARSED_BY_DOXYGEN - , typename internal::enable_if<internal::is_same<Scalar,typename OtherDerType::Scalar>::value,void*>::type = 0 + , typename internal::enable_if<internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value,void*>::type = 0 #endif ) : m_value(other.value()), m_derivatives(other.derivatives()) @@ -257,20 +264,16 @@ class AutoDiffScalar -m_derivatives); } - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator*(const Scalar& other) const { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - m_value * other, - (m_derivatives * other)); + return MakeAutoDiffScalar(m_value * other, m_derivatives * other); } - friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator*(const Scalar& other, const AutoDiffScalar& a) { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - a.value() * other, - a.derivatives() * other); + return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other); } // inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type > @@ -289,20 +292,16 @@ class AutoDiffScalar // a.derivatives() * other); // } - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator/(const Scalar& other) const { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - m_value / other, - (m_derivatives * (Scalar(1)/other))); + return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other))); } - friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator/(const Scalar& other, const AutoDiffScalar& a) { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - other / a.value(), - a.derivatives() * (Scalar(-other) / (a.value()*a.value()))); + return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value()))); } // inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type > @@ -322,34 +321,29 @@ class AutoDiffScalar // } template<typename OtherDerType> - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, - const CwiseBinaryOp<internal::scalar_difference_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename 
internal::remove_all<OtherDerType>::type > > > > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE( + CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) > operator/(const AutoDiffScalar<OtherDerType>& other) const { internal::make_coherent(m_derivatives, other.derivatives()); - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, - const CwiseBinaryOp<internal::scalar_difference_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >( + return MakeAutoDiffScalar( m_value / other.value(), - ((m_derivatives * other.value()) - (m_value * other.derivatives())) + ((m_derivatives * other.value()) - (other.derivatives() * m_value)) * (Scalar(1)/(other.value()*other.value()))); } template<typename OtherDerType> inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > > + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product), + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > > operator*(const AutoDiffScalar<OtherDerType>& other) const { internal::make_coherent(m_derivatives, other.derivatives()); - return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >( + return MakeAutoDiffScalar( m_value * other.value(), - (m_derivatives * other.value()) + (m_value * other.derivatives())); + (m_derivatives * other.value()) + (other.derivatives() * m_value)); } inline AutoDiffScalar& operator*=(const Scalar& other) @@ -426,18 +420,18 @@ struct auto_diff_special_op<_DerType, true> } - inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type > + inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type > operator*(const Real& other) const { - return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >( + return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >( derived().value() * other, derived().derivatives() * other); } - friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type > + friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type > operator*(const Real& other, const AutoDiffScalar<_DerType>& a) { - return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >( + return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >( a.value() * other, a.derivatives() * other); } @@ -501,43 +495,44 @@ struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, } }; -template<typename A_Scalar, int A_Rows, int A_Cols, int 
A_Options, int A_MaxRows, int A_MaxCols> -struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar> -{ - enum { Defined = 1 }; - typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType; -}; - -template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols> -struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> > -{ - enum { Defined = 1 }; - typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType; -}; +} // end namespace internal -template<typename DerType> -struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar> +template<typename DerType, typename BinOp> +struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp> { - enum { Defined = 1 }; typedef AutoDiffScalar<DerType> ReturnType; }; -template<typename DerType> -struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> > +template<typename DerType, typename BinOp> +struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp> { - enum { Defined = 1 }; typedef AutoDiffScalar<DerType> ReturnType; }; -} // end namespace internal + +// The following is an attempt to let Eigen's known about expression template, but that's more tricky! + +// template<typename DerType, typename BinOp> +// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp> +// { +// enum { Defined = 1 }; +// typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType; +// }; +// +// template<typename DerType1,typename DerType2, typename BinOp> +// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp> +// { +// enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value }; +// typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType; +// }; #define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \ template<typename DerType> \ - inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \ + inline const Eigen::AutoDiffScalar< \ + EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \ FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \ using namespace Eigen; \ - typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \ - typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \ + EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \ CODE; \ } @@ -548,56 +543,75 @@ inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x) { template<typename DerType> inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&) { return 0.; } template<typename DerType, typename T> -inline AutoDiffScalar<DerType> (min)(const AutoDiffScalar<DerType>& x, const T& y) { return (x <= y ? 
x : y); } +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const T& y) { + typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS; + return (x <= y ? ADS(x) : ADS(y)); +} template<typename DerType, typename T> -inline AutoDiffScalar<DerType> (max)(const AutoDiffScalar<DerType>& x, const T& y) { return (x >= y ? x : y); } +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const T& y) { + typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS; + return (x >= y ? ADS(x) : ADS(y)); +} template<typename DerType, typename T> -inline AutoDiffScalar<DerType> (min)(const T& x, const AutoDiffScalar<DerType>& y) { return (x < y ? x : y); } +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const T& x, const AutoDiffScalar<DerType>& y) { + typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS; + return (x < y ? ADS(x) : ADS(y)); +} template<typename DerType, typename T> -inline AutoDiffScalar<DerType> (max)(const T& x, const AutoDiffScalar<DerType>& y) { return (x > y ? x : y); } +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const T& x, const AutoDiffScalar<DerType>& y) { + typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS; + return (x > y ? ADS(x) : ADS(y)); +} +template<typename DerType> +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) { + return (x.value() < y.value() ? x : y); +} +template<typename DerType> +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) { + return (x.value() >= y.value() ? x : y); +} + EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs, using std::abs; - return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );) + return Eigen::MakeAutoDiffScalar(abs(x.value()), x.derivatives() * (x.value()<0 ? 
-1 : 1) );) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2, using numext::abs2; - return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));) + return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt, using std::sqrt; Scalar sqrtx = sqrt(x.value()); - return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));) + return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos, using std::cos; using std::sin; - return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));) + return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin, using std::sin; using std::cos; - return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));) + return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp, using std::exp; Scalar expx = exp(x.value()); - return ReturnType(expx,x.derivatives() * expx);) + return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log, using std::log; - return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));) + return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));) template<typename DerType> -inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar>, const typename internal::remove_all<DerType>::type> > -pow(const Eigen::AutoDiffScalar<DerType>& x, typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar y) +inline const Eigen::AutoDiffScalar< +EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) > +pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y) { using namespace Eigen; - typedef typename internal::remove_all<DerType>::type DerTypeCleaned; - typedef typename Eigen::internal::traits<DerTypeCleaned>::Scalar Scalar; - return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerTypeCleaned> >( - std::pow(x.value(),y), - x.derivatives() * (y * std::pow(x.value(),y-1))); + using std::pow; + return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1))); } @@ -622,27 +636,44 @@ atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan, using std::tan; using std::cos; - return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));) + return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin, using std::sqrt; using std::asin; - return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));) + return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos, using std::sqrt; using std::acos; - return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));) + return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * 
(Scalar(-1)/sqrt(1-numext::abs2(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh, + using std::cosh; + using std::tanh; + return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh, + using std::sinh; + using std::cosh; + return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh, + using std::sinh; + using std::cosh; + return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));) #undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> > - : NumTraits< typename NumTraits<typename DerType::Scalar>::Real > + : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real > { - typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime, - DerType::Options, DerType::MaxRowsAtCompileTime, DerType::MaxColsAtCompileTime> > Real; + typedef typename internal::remove_all<DerType>::type DerTypeCleaned; + typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime, + 0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real; typedef AutoDiffScalar<DerType> NonInteger; typedef AutoDiffScalar<DerType> Nested; + typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal; enum{ RequireInitialization = 1 }; diff --git a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt b/unsupported/Eigen/src/AutoDiff/CMakeLists.txt deleted file mode 100644 index ad91fd9c4..000000000 --- a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_AutoDiff_SRCS "*.h") - -INSTALL(FILES - ${Eigen_AutoDiff_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/AutoDiff COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/BVH/CMakeLists.txt b/unsupported/Eigen/src/BVH/CMakeLists.txt deleted file mode 100644 index b377d865c..000000000 --- a/unsupported/Eigen/src/BVH/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_BVH_SRCS "*.h") - -INSTALL(FILES - ${Eigen_BVH_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/BVH COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt deleted file mode 100644 index a7e8c7553..000000000 --- a/unsupported/Eigen/src/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -ADD_SUBDIRECTORY(AutoDiff) -ADD_SUBDIRECTORY(BVH) -ADD_SUBDIRECTORY(Eigenvalues) -ADD_SUBDIRECTORY(FFT) -ADD_SUBDIRECTORY(IterativeSolvers) -ADD_SUBDIRECTORY(LevenbergMarquardt) -ADD_SUBDIRECTORY(MatrixFunctions) -ADD_SUBDIRECTORY(MoreVectorization) -ADD_SUBDIRECTORY(NonLinearOptimization) -ADD_SUBDIRECTORY(NumericalDiff) -ADD_SUBDIRECTORY(Polynomials) -ADD_SUBDIRECTORY(Skyline) -ADD_SUBDIRECTORY(SparseExtra) -ADD_SUBDIRECTORY(KroneckerProduct) -ADD_SUBDIRECTORY(Splines) diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h index 3b6a69aff..866a8a460 100644 --- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h @@ -628,15 +628,15 @@ ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>& m_info = Success; } - delete select; + 
delete[] select; } - delete v; - delete iparam; - delete ipntr; - delete workd; - delete workl; - delete resid; + delete[] v; + delete[] iparam; + delete[] ipntr; + delete[] workd; + delete[] workl; + delete[] resid; m_isInitialized = true; diff --git a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt deleted file mode 100644 index 1d4387c82..000000000 --- a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Eigenvalues_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Eigenvalues_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Eigenvalues COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt new file mode 100644 index 000000000..40af550e8 --- /dev/null +++ b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_EulerAngles_SRCS "*.h") + +INSTALL(FILES + ${Eigen_EulerAngles_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel + ) diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h new file mode 100644 index 000000000..13a0da1ab --- /dev/null +++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h @@ -0,0 +1,386 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EULERANGLESCLASS_H// TODO: Fix previous "EIGEN_EULERANGLES_H" definition? +#define EIGEN_EULERANGLESCLASS_H + +namespace Eigen +{ + /*template<typename Other, + int OtherRows=Other::RowsAtCompileTime, + int OtherCols=Other::ColsAtCompileTime> + struct ei_eulerangles_assign_impl;*/ + + /** \class EulerAngles + * + * \ingroup EulerAngles_Module + * + * \brief Represents a rotation in a 3 dimensional space as three Euler angles. + * + * Euler rotation is a set of three rotation of three angles over three fixed axes, defined by the EulerSystem given as a template parameter. + * + * Here is how intrinsic Euler angles works: + * - first, rotate the axes system over the alpha axis in angle alpha + * - then, rotate the axes system over the beta axis(which was rotated in the first stage) in angle beta + * - then, rotate the axes system over the gamma axis(which was rotated in the two stages above) in angle gamma + * + * \note This class support only intrinsic Euler angles for simplicity, + * see EulerSystem how to easily overcome this for extrinsic systems. + * + * ### Rotation representation and conversions ### + * + * It has been proved(see Wikipedia link below) that every rotation can be represented + * by Euler angles, but there is no singular representation (e.g. unlike rotation matrices). + * Therefore, you can convert from Eigen rotation and to them + * (including rotation matrices, which is not called "rotations" by Eigen design). + * + * Euler angles usually used for: + * - convenient human representation of rotation, especially in interactive GUI. + * - gimbal systems and robotics + * - efficient encoding(i.e. 3 floats only) of rotation for network protocols. + * + * However, Euler angles are slow comparing to quaternion or matrices, + * because their unnatural math definition, although it's simple for human. 
+ * To overcome this, this class provide easy movement from the math friendly representation + * to the human friendly representation, and vise-versa. + * + * All the user need to do is a safe simple C++ type conversion, + * and this class take care for the math. + * Additionally, some axes related computation is done in compile time. + * + * #### Euler angles ranges in conversions #### + * + * When converting some rotation to Euler angles, there are some ways you can guarantee + * the Euler angles ranges. + * + * #### implicit ranges #### + * When using implicit ranges, all angles are guarantee to be in the range [-PI, +PI], + * unless you convert from some other Euler angles. + * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI). + * \sa EulerAngles(const MatrixBase<Derived>&) + * \sa EulerAngles(const RotationBase<Derived, 3>&) + * + * #### explicit ranges #### + * When using explicit ranges, all angles are guarantee to be in the range you choose. + * In the range Boolean parameter, you're been ask whether you prefer the positive range or not: + * - _true_ - force the range between [0, +2*PI] + * - _false_ - force the range between [-PI, +PI] + * + * ##### compile time ranges ##### + * This is when you have compile time ranges and you prefer to + * use template parameter. (e.g. for performance) + * \sa FromRotation() + * + * ##### run-time time ranges ##### + * Run-time ranges are also supported. + * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool) + * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool) + * + * ### Convenient user typedefs ### + * + * Convenient typedefs for EulerAngles exist for float and double scalar, + * in a form of EulerAngles{A}{B}{C}{scalar}, + * e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf. + * + * Only for positive axes{+x,+y,+z} Euler systems are have convenient typedef. + * If you need negative axes{-x,-y,-z}, it is recommended to create you own typedef with + * a word that represent what you need. + * + * ### Example ### + * + * \include EulerAngles.cpp + * Output: \verbinclude EulerAngles.out + * + * ### Additional reading ### + * + * If you're want to get more idea about how Euler system work in Eigen see EulerSystem. + * + * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles + * + * \tparam _Scalar the scalar type, i.e., the type of the angles. + * + * \tparam _System the EulerSystem to use, which represents the axes of rotation. + */ + template <typename _Scalar, class _System> + class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3> + { + public: + /** the scalar type of the angles */ + typedef _Scalar Scalar; + + /** the EulerSystem to use, which represents the axes of rotation. */ + typedef _System System; + + typedef Matrix<Scalar,3,3> Matrix3; /*!< the equivalent rotation matrix type */ + typedef Matrix<Scalar,3,1> Vector3; /*!< the equivalent 3 dimension vector type */ + typedef Quaternion<Scalar> QuaternionType; /*!< the equivalent quaternion type */ + typedef AngleAxis<Scalar> AngleAxisType; /*!< the equivalent angle-axis type */ + + /** \returns the axis vector of the first (alpha) rotation */ + static Vector3 AlphaAxisVector() { + const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1); + return System::IsAlphaOpposite ? 
-u : u; + } + + /** \returns the axis vector of the second (beta) rotation */ + static Vector3 BetaAxisVector() { + const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1); + return System::IsBetaOpposite ? -u : u; + } + + /** \returns the axis vector of the third (gamma) rotation */ + static Vector3 GammaAxisVector() { + const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1); + return System::IsGammaOpposite ? -u : u; + } + + private: + Vector3 m_angles; + + public: + /** Default constructor without initialization. */ + EulerAngles() {} + /** Constructs and initialize Euler angles(\p alpha, \p beta, \p gamma). */ + EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) : + m_angles(alpha, beta, gamma) {} + + /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m. + * + * \note All angles will be in the range [-PI, PI]. + */ + template<typename Derived> + EulerAngles(const MatrixBase<Derived>& m) { *this = m; } + + /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m, + * with options to choose for each angle the requested range. + * + * If positive range is true, then the specified angle will be in the range [0, +2*PI]. + * Otherwise, the specified angle will be in the range [-PI, +PI]. + * + * \param m The 3x3 rotation matrix to convert + * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + */ + template<typename Derived> + EulerAngles( + const MatrixBase<Derived>& m, + bool positiveRangeAlpha, + bool positiveRangeBeta, + bool positiveRangeGamma) { + + System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma); + } + + /** Constructs and initialize Euler angles from a rotation \p rot. + * + * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles. + * If rot is an EulerAngles, expected EulerAngles range is __undefined__. + * (Use other functions here for enforcing range if this effect is desired) + */ + template<typename Derived> + EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; } + + /** Constructs and initialize Euler angles from a rotation \p rot, + * with options to choose for each angle the requested range. + * + * If positive range is true, then the specified angle will be in the range [0, +2*PI]. + * Otherwise, the specified angle will be in the range [-PI, +PI]. + * + * \param rot The 3x3 rotation matrix to convert + * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + */ + template<typename Derived> + EulerAngles( + const RotationBase<Derived, 3>& rot, + bool positiveRangeAlpha, + bool positiveRangeBeta, + bool positiveRangeGamma) { + + System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma); + } + + /** \returns The angle values stored in a vector (alpha, beta, gamma). */ + const Vector3& angles() const { return m_angles; } + /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */ + Vector3& angles() { return m_angles; } + + /** \returns The value of the first angle. 
*/ + Scalar alpha() const { return m_angles[0]; } + /** \returns A read-write reference to the angle of the first angle. */ + Scalar& alpha() { return m_angles[0]; } + + /** \returns The value of the second angle. */ + Scalar beta() const { return m_angles[1]; } + /** \returns A read-write reference to the angle of the second angle. */ + Scalar& beta() { return m_angles[1]; } + + /** \returns The value of the third angle. */ + Scalar gamma() const { return m_angles[2]; } + /** \returns A read-write reference to the angle of the third angle. */ + Scalar& gamma() { return m_angles[2]; } + + /** \returns The Euler angles rotation inverse (which is as same as the negative), + * (-alpha, -beta, -gamma). + */ + EulerAngles inverse() const + { + EulerAngles res; + res.m_angles = -m_angles; + return res; + } + + /** \returns The Euler angles rotation negative (which is as same as the inverse), + * (-alpha, -beta, -gamma). + */ + EulerAngles operator -() const + { + return inverse(); + } + + /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m, + * with options to choose for each angle the requested range (__only in compile time__). + * + * If positive range is true, then the specified angle will be in the range [0, +2*PI]. + * Otherwise, the specified angle will be in the range [-PI, +PI]. + * + * \param m The 3x3 rotation matrix to convert + * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + */ + template< + bool PositiveRangeAlpha, + bool PositiveRangeBeta, + bool PositiveRangeGamma, + typename Derived> + static EulerAngles FromRotation(const MatrixBase<Derived>& m) + { + EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3) + + EulerAngles e; + System::template CalcEulerAngles< + PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m); + return e; + } + + /** Constructs and initialize Euler angles from a rotation \p rot, + * with options to choose for each angle the requested range (__only in compile time__). + * + * If positive range is true, then the specified angle will be in the range [0, +2*PI]. + * Otherwise, the specified angle will be in the range [-PI, +PI]. + * + * \param rot The 3x3 rotation matrix to convert + * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + */ + template< + bool PositiveRangeAlpha, + bool PositiveRangeBeta, + bool PositiveRangeGamma, + typename Derived> + static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot) + { + return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix()); + } + + /*EulerAngles& fromQuaternion(const QuaternionType& q) + { + // TODO: Implement it in a faster way for quaternions + // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/ + // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below) + // Currently we compute all matrix cells from quaternion. 
+ + // Special case only for ZYX + //Scalar y2 = q.y() * q.y(); + //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z()))); + //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x())); + //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2))); + }*/ + + /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinant of +1). */ + template<typename Derived> + EulerAngles& operator=(const MatrixBase<Derived>& m) { + EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3) + + System::CalcEulerAngles(*this, m); + return *this; + } + + // TODO: Assign and construct from another EulerAngles (with different system) + + /** Set \c *this from a rotation. */ + template<typename Derived> + EulerAngles& operator=(const RotationBase<Derived, 3>& rot) { + System::CalcEulerAngles(*this, rot.toRotationMatrix()); + return *this; + } + + // TODO: Support isApprox function + + /** \returns an equivalent 3x3 rotation matrix. */ + Matrix3 toRotationMatrix() const + { + return static_cast<QuaternionType>(*this).toRotationMatrix(); + } + + /** Convert the Euler angles to quaternion. */ + operator QuaternionType() const + { + return + AngleAxisType(alpha(), AlphaAxisVector()) * + AngleAxisType(beta(), BetaAxisVector()) * + AngleAxisType(gamma(), GammaAxisVector()); + } + + friend std::ostream& operator<<(std::ostream& s, const EulerAngles<Scalar, System>& eulerAngles) + { + s << eulerAngles.angles().transpose(); + return s; + } + }; + +#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \ + /** \ingroup EulerAngles_Module */ \ + typedef EulerAngles<SCALAR_TYPE, EulerSystem##AXES> EulerAngles##AXES##SCALAR_POSTFIX; + +#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \ + \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \ + \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX) + +EIGEN_EULER_ANGLES_TYPEDEFS(float, f) +EIGEN_EULER_ANGLES_TYPEDEFS(double, d) + + namespace internal + { + template<typename _Scalar, class _System> + struct traits<EulerAngles<_Scalar, _System> > + { + typedef _Scalar Scalar; + }; + } + +} + +#endif // EIGEN_EULERANGLESCLASS_H diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h new file mode 100644 index 000000000..98f9f647d --- /dev/null +++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h @@ -0,0 +1,326 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
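A minimal sketch of the range-control API defined in EulerAngles.h above, using only names introduced by this patch (illustrative, not part of the patch):

    Eigen::Matrix3d R = Eigen::AngleAxisd(2.5, Eigen::Vector3d::UnitZ()).toRotationMatrix();

    Eigen::EulerAnglesZYXd a(R);                      // every angle in [-PI, +PI]
    Eigen::EulerAnglesZYXd b(R, true, false, false);  // alpha forced into [0, +2*PI], chosen at run time
    Eigen::EulerAnglesZYXd c =                        // the same choice made at compile time
        Eigen::EulerAnglesZYXd::FromRotation<true, false, false>(R);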
+ +#ifndef EIGEN_EULERSYSTEM_H +#define EIGEN_EULERSYSTEM_H + +namespace Eigen +{ + // Forward declerations + template <typename _Scalar, class _System> + class EulerAngles; + + namespace internal + { + // TODO: Check if already exists on the rest API + template <int Num, bool IsPositive = (Num > 0)> + struct Abs + { + enum { value = Num }; + }; + + template <int Num> + struct Abs<Num, false> + { + enum { value = -Num }; + }; + + template <int Axis> + struct IsValidAxis + { + enum { value = Axis != 0 && Abs<Axis>::value <= 3 }; + }; + } + + #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1] + + /** \brief Representation of a fixed signed rotation axis for EulerSystem. + * + * \ingroup EulerAngles_Module + * + * Values here represent: + * - The axis of the rotation: X, Y or Z. + * - The sign (i.e. direction of the rotation along the axis): positive(+) or negative(-) + * + * Therefore, this could express all the axes {+X,+Y,+Z,-X,-Y,-Z} + * + * For positive axis, use +EULER_{axis}, and for negative axis use -EULER_{axis}. + */ + enum EulerAxis + { + EULER_X = 1, /*!< the X axis */ + EULER_Y = 2, /*!< the Y axis */ + EULER_Z = 3 /*!< the Z axis */ + }; + + /** \class EulerSystem + * + * \ingroup EulerAngles_Module + * + * \brief Represents a fixed Euler rotation system. + * + * This meta-class goal is to represent the Euler system in compilation time, for EulerAngles. + * + * You can use this class to get two things: + * - Build an Euler system, and then pass it as a template parameter to EulerAngles. + * - Query some compile time data about an Euler system. (e.g. Whether it's tait bryan) + * + * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles) + * This meta-class store constantly those signed axes. (see \ref EulerAxis) + * + * ### Types of Euler systems ### + * + * All and only valid 3 dimension Euler rotation over standard + * signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported: + * - all axes X, Y, Z in each valid order (see below what order is valid) + * - rotation over the axis is supported both over the positive and negative directions. + * - both tait bryan and proper/classic Euler angles (i.e. the opposite). + * + * Since EulerSystem support both positive and negative directions, + * you may call this rotation distinction in other names: + * - _right handed_ or _left handed_ + * - _counterclockwise_ or _clockwise_ + * + * Notice all axed combination are valid, and would trigger a static assertion. + * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid. + * This yield two and only two classes: + * - _tait bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z} + * - _proper/classic Euler angles_ - The first and the third unsigned axes is equal, + * and the second is different, e.g. {X,Y,X} + * + * ### Intrinsic vs extrinsic Euler systems ### + * + * Only intrinsic Euler systems are supported for simplicity. + * If you want to use extrinsic Euler systems, + * just use the equal intrinsic opposite order for axes and angles. + * I.e axes (A,B,C) becomes (C,B,A), and angles (a,b,c) becomes (c,b,a). + * + * ### Convenient user typedefs ### + * + * Convenient typedefs for EulerSystem exist (only for positive axes Euler systems), + * in a form of EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ. 
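The convenient typedefs cover positive-axis systems only; as the documentation above suggests, any other system is a one-line typedef away (illustrative sketch with hypothetical names):

    // Intrinsic rotations about +Z, then -Y, then +X, double precision.
    typedef Eigen::EulerSystem<Eigen::EULER_Z, -Eigen::EULER_Y, Eigen::EULER_X> MyEulerSystemZnYX;
    typedef Eigen::EulerAngles<double, MyEulerSystemZnYX> MyEulerAnglesZnYXd;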
+ * + * ### Additional reading ### + * + * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles + * + * \tparam _AlphaAxis the first fixed EulerAxis + * + * \tparam _AlphaAxis the second fixed EulerAxis + * + * \tparam _AlphaAxis the third fixed EulerAxis + */ + template <int _AlphaAxis, int _BetaAxis, int _GammaAxis> + class EulerSystem + { + public: + // It's defined this way and not as enum, because I think + // that enum is not guerantee to support negative numbers + + /** The first rotation axis */ + static const int AlphaAxis = _AlphaAxis; + + /** The second rotation axis */ + static const int BetaAxis = _BetaAxis; + + /** The third rotation axis */ + static const int GammaAxis = _GammaAxis; + + enum + { + AlphaAxisAbs = internal::Abs<AlphaAxis>::value, /*!< the first rotation axis unsigned */ + BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */ + GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */ + + IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< weather alpha axis is negative */ + IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< weather beta axis is negative */ + IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< weather gamma axis is negative */ + + IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< weather the Euler system is odd */ + IsEven = IsOdd ? 0 : 1, /*!< weather the Euler system is even */ + + IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< weather the Euler system is tait bryan */ + }; + + private: + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<AlphaAxis>::value, + ALPHA_AXIS_IS_INVALID); + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<BetaAxis>::value, + BETA_AXIS_IS_INVALID); + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<GammaAxis>::value, + GAMMA_AXIS_IS_INVALID); + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs, + ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS); + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs, + BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS); + + enum + { + // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. + // They are used in this class converters. + // They are always different from each other, and their possible values are: 0, 1, or 2. + I = AlphaAxisAbs - 1, + J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3, + K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3 + }; + + // TODO: Get @mat parameter in form that avoids double evaluation. 
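    // For example, instantiating with EulerSystemZYX gives AlphaAxisAbs = 3, BetaAxisAbs = 2,
    // GammaAxisAbs = 1 and IsOdd = 1, hence (I, J, K) = (2, 1, 0); the converters below read the
    // rotation matrix through exactly this index permutation.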
+ template <typename Derived> + static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/) + { + using std::atan2; + using std::sin; + using std::cos; + + typedef typename Derived::Scalar Scalar; + typedef Matrix<Scalar,2,1> Vector2; + + res[0] = atan2(mat(J,K), mat(K,K)); + Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm(); + if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) { + if(res[0] > Scalar(0)) { + res[0] -= Scalar(EIGEN_PI); + } + else { + res[0] += Scalar(EIGEN_PI); + } + res[1] = atan2(-mat(I,K), -c2); + } + else + res[1] = atan2(-mat(I,K), c2); + Scalar s1 = sin(res[0]); + Scalar c1 = cos(res[0]); + res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J)); + } + + template <typename Derived> + static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/) + { + using std::atan2; + using std::sin; + using std::cos; + + typedef typename Derived::Scalar Scalar; + typedef Matrix<Scalar,2,1> Vector2; + + res[0] = atan2(mat(J,I), mat(K,I)); + if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) + { + if(res[0] > Scalar(0)) { + res[0] -= Scalar(EIGEN_PI); + } + else { + res[0] += Scalar(EIGEN_PI); + } + Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm(); + res[1] = -atan2(s2, mat(I,I)); + } + else + { + Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm(); + res[1] = atan2(s2, mat(I,I)); + } + + // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles, + // we can compute their respective rotation, and apply its inverse to M. Since the result must + // be a rotation around x, we have: + // + // c2 s1.s2 c1.s2 1 0 0 + // 0 c1 -s1 * M = 0 c3 s3 + // -s2 s1.c2 c1.c2 0 -s3 c3 + // + // Thus: m11.c1 - m21.s1 = c3 & m12.c1 - m22.s1 = s3 + + Scalar s1 = sin(res[0]); + Scalar c1 = cos(res[0]); + res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J)); + } + + template<typename Scalar> + static void CalcEulerAngles( + EulerAngles<Scalar, EulerSystem>& res, + const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat) + { + CalcEulerAngles(res, mat, false, false, false); + } + + template< + bool PositiveRangeAlpha, + bool PositiveRangeBeta, + bool PositiveRangeGamma, + typename Scalar> + static void CalcEulerAngles( + EulerAngles<Scalar, EulerSystem>& res, + const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat) + { + CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma); + } + + template<typename Scalar> + static void CalcEulerAngles( + EulerAngles<Scalar, EulerSystem>& res, + const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat, + bool PositiveRangeAlpha, + bool PositiveRangeBeta, + bool PositiveRangeGamma) + { + CalcEulerAngles_imp( + res.angles(), mat, + typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type()); + + if (IsAlphaOpposite == IsOdd) + res.alpha() = -res.alpha(); + + if (IsBetaOpposite == IsOdd) + res.beta() = -res.beta(); + + if (IsGammaOpposite == IsOdd) + res.gamma() = -res.gamma(); + + // Saturate results to the requested range + if (PositiveRangeAlpha && (res.alpha() < 0)) + res.alpha() += Scalar(2 * EIGEN_PI); + + if (PositiveRangeBeta && (res.beta() < 0)) + res.beta() += Scalar(2 * EIGEN_PI); + + if (PositiveRangeGamma && (res.gamma() < 0)) + res.gamma() += Scalar(2 * EIGEN_PI); + } + + template <typename _Scalar, class _System> 
+ friend class Eigen::EulerAngles; + }; + +#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \ + /** \ingroup EulerAngles_Module */ \ + typedef EulerSystem<EULER_##A, EULER_##B, EULER_##C> EulerSystem##A##B##C; + + EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,Z) + EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,X) + EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,Y) + EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,X) + + EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,X) + EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,Y) + EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Z) + EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Y) + + EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Y) + EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Z) + EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,X) + EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,Z) +} + +#endif // EIGEN_EULERSYSTEM_H diff --git a/unsupported/Eigen/src/FFT/CMakeLists.txt b/unsupported/Eigen/src/FFT/CMakeLists.txt deleted file mode 100644 index edcffcb18..000000000 --- a/unsupported/Eigen/src/FFT/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_FFT_SRCS "*.h") - -INSTALL(FILES - ${Eigen_FFT_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/FFT COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt b/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt deleted file mode 100644 index 7986afc5e..000000000 --- a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_IterativeSolvers_SRCS "*.h") - -INSTALL(FILES - ${Eigen_IterativeSolvers_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/IterativeSolvers COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h index fbe21fc7e..5a82b0df6 100644 --- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h @@ -62,7 +62,7 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition typedef typename Dest::RealScalar RealScalar; typedef typename Dest::Scalar Scalar; typedef Matrix < Scalar, Dynamic, 1 > VectorType; - typedef Matrix < Scalar, Dynamic, Dynamic > FMatrixType; + typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType; RealScalar tol = tol_error; const Index maxIters = iters; @@ -157,7 +157,8 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition // insert coefficients into upper matrix triangle H.col(k-1).head(k) = v.head(k); - bool stop = (k==m || abs(w(k)) < tol * r0Norm || iters == maxIters); + tol_error = abs(w(k)) / r0Norm; + bool stop = (k==m || tol_error < tol || iters == maxIters); if (stop || k == restart) { diff --git a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt b/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt deleted file mode 100644 index 4daefebee..000000000 --- a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_KroneckerProduct_SRCS "*.h") - -INSTALL(FILES - ${Eigen_KroneckerProduct_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/KroneckerProduct COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h index 4d3e5358e..582fa8512 100644 --- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h +++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h @@ -203,7 +203,7 @@ struct traits<KroneckerProduct<_Lhs,_Rhs> > { typedef typename remove_all<_Lhs>::type Lhs; typedef typename remove_all<_Rhs>::type Rhs; - typedef typename scalar_product_traits<typename Lhs::Scalar, 
typename Rhs::Scalar>::ReturnType Scalar; + typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex; enum { @@ -222,7 +222,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> > typedef MatrixXpr XprKind; typedef typename remove_all<_Lhs>::type Lhs; typedef typename remove_all<_Rhs>::type Rhs; - typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; + typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind; typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex; @@ -239,7 +239,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> > RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit), Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits) - | EvalBeforeNestingBit | EvalBeforeAssigningBit, + | EvalBeforeNestingBit, CoeffReadCost = HugeCost }; diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt deleted file mode 100644 index d9690854d..000000000 --- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h") - -INSTALL(FILES - ${Eigen_LevenbergMarquardt_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h index b30e0a90a..995427978 100644 --- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h @@ -304,7 +304,7 @@ LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType &x) // m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative if (!m_useExternalScaling) m_diag.resize(n); - eigen_assert( (!m_useExternalScaling || m_diag.size()==n) || "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'"); + eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'"); m_qtf.resize(n); /* Function Body */ diff --git a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt b/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt deleted file mode 100644 index cdde64d2c..000000000 --- a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_MatrixFunctions_SRCS "*.h") - -INSTALL(FILES - ${Eigen_MatrixFunctions_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MatrixFunctions COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index bbb7e5776..4bb1852b6 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -65,7 +65,7 @@ template <typename MatrixType> void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V) { typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; - const RealScalar b[] = {120., 60., 12., 
1.}; + const RealScalar b[] = {120.L, 60.L, 12.L, 1.L}; const MatrixType A2 = A * A; const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols()); U.noalias() = A * tmp; @@ -81,7 +81,7 @@ template <typename MatrixType> void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V) { typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; - const RealScalar b[] = {30240., 15120., 3360., 420., 30., 1.}; + const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L}; const MatrixType A2 = A * A; const MatrixType A4 = A2 * A2; const MatrixType tmp = b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols()); @@ -98,7 +98,7 @@ template <typename MatrixType> void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V) { typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; - const RealScalar b[] = {17297280., 8648640., 1995840., 277200., 25200., 1512., 56., 1.}; + const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L}; const MatrixType A2 = A * A; const MatrixType A4 = A2 * A2; const MatrixType A6 = A4 * A2; @@ -118,8 +118,8 @@ template <typename MatrixType> void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V) { typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; - const RealScalar b[] = {17643225600., 8821612800., 2075673600., 302702400., 30270240., - 2162160., 110880., 3960., 90., 1.}; + const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L, + 2162160.L, 110880.L, 3960.L, 90.L, 1.L}; const MatrixType A2 = A * A; const MatrixType A4 = A2 * A2; const MatrixType A6 = A4 * A2; @@ -139,9 +139,9 @@ template <typename MatrixType> void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V) { typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar; - const RealScalar b[] = {64764752532480000., 32382376266240000., 7771770303897600., - 1187353796428800., 129060195264000., 10559470521600., 670442572800., - 33522128640., 1323241920., 40840800., 960960., 16380., 182., 1.}; + const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L, + 1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L, + 33522128640.L, 1323241920.L, 40840800.L, 960960.L, 16380.L, 182.L, 1.L}; const MatrixType A2 = A * A; const MatrixType A4 = A2 * A2; const MatrixType A6 = A4 * A2; @@ -210,9 +210,9 @@ struct matrix_exp_computeUV<MatrixType, float> using std::pow; const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff(); squarings = 0; - if (l1norm < 4.258730016922831e-001) { + if (l1norm < 4.258730016922831e-001f) { matrix_exp_pade3(arg, U, V); - } else if (l1norm < 1.880152677804762e+000) { + } else if (l1norm < 1.880152677804762e+000f) { matrix_exp_pade5(arg, U, V); } else { const float maxnorm = 3.925724783138660f; diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index 8f7a6f3b0..db2449d02 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -132,6 +132,7 @@ template <typename EivalsType, typename Cluster> void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters) { typedef typename EivalsType::Index Index; + typedef typename EivalsType::RealScalar RealScalar; for (Index i=0; i<eivals.rows(); ++i) { // Find 
cluster containing i-th ei'val, adding a new cluster if necessary typename std::list<Cluster>::iterator qi = matrix_function_find_cluster(i, clusters); @@ -145,7 +146,7 @@ void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<C // Look for other element to add to the set for (Index j=i+1; j<eivals.rows(); ++j) { - if (abs(eivals(j) - eivals(i)) <= matrix_function_separation + if (abs(eivals(j) - eivals(i)) <= RealScalar(matrix_function_separation) && std::find(qi->begin(), qi->end(), j) == qi->end()) { typename std::list<Cluster>::iterator qj = matrix_function_find_cluster(j, clusters); if (qj == clusters.end()) { @@ -403,11 +404,10 @@ struct matrix_function_compute<MatrixType, 0> typedef internal::traits<MatrixType> Traits; typedef typename Traits::Scalar Scalar; static const int Rows = Traits::RowsAtCompileTime, Cols = Traits::ColsAtCompileTime; - static const int Options = MatrixType::Options; static const int MaxRows = Traits::MaxRowsAtCompileTime, MaxCols = Traits::MaxColsAtCompileTime; typedef std::complex<Scalar> ComplexScalar; - typedef Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols> ComplexMatrix; + typedef Matrix<ComplexScalar, Rows, Cols, 0, MaxRows, MaxCols> ComplexMatrix; ComplexMatrix CA = A.template cast<ComplexScalar>(); ComplexMatrix Cresult; @@ -508,9 +508,8 @@ template<typename Derived> class MatrixFunctionReturnValue typedef internal::traits<NestedEvalTypeClean> Traits; static const int RowsAtCompileTime = Traits::RowsAtCompileTime; static const int ColsAtCompileTime = Traits::ColsAtCompileTime; - static const int Options = NestedEvalTypeClean::Options; typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar; - typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType; + typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType; typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType; AtomicType atomic(m_f); diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index e43e86e90..1acfbed9e 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -37,6 +37,7 @@ template <typename MatrixType> void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) { typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; using std::abs; using std::ceil; using std::imag; @@ -54,14 +55,14 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) { result(0,1) = A(0,1) / A(0,0); } - else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) + else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) { result(0,1) = A(0,1) * (logA11 - logA00) / y; } else { // computation in previous branch is inaccurate if A(1,1) \approx A(0,0) - int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI))); + int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI))); result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y; } } @@ -232,8 +233,8 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result) MatrixType T = A, sqrtT; int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value; - const RealScalar maxNormForPade = maxPadeDegree<= 5? 
5.3149729967117310e-1: // single precision - maxPadeDegree<= 7? 2.6429608311114350e-1: // double precision + const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1L: // single precision + maxPadeDegree<= 7? 2.6429608311114350e-1L: // double precision maxPadeDegree<= 8? 2.32777776523703892094e-1L: // extended precision maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L: // double-double 1.1880960220216759245467951592883642e-1L; // quadruple precision @@ -333,9 +334,8 @@ public: typedef internal::traits<DerivedEvalTypeClean> Traits; static const int RowsAtCompileTime = Traits::RowsAtCompileTime; static const int ColsAtCompileTime = Traits::ColsAtCompileTime; - static const int Options = DerivedEvalTypeClean::Options; typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar; - typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType; + typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType; typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType; AtomicType atomic; diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index f37d31c3f..ebc433d89 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -196,11 +196,11 @@ void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const { using std::ldexp; const int digits = std::numeric_limits<RealScalar>::digits; - const RealScalar maxNormForPade = digits <= 24? 4.3386528e-1f: // sigle precision - digits <= 53? 2.789358995219730e-1: // double precision - digits <= 64? 2.4471944416607995472e-1L: // extended precision - digits <= 106? 1.1016843812851143391275867258512e-1L: // double-double - 9.134603732914548552537150753385375e-2L; // quadruple precision + const RealScalar maxNormForPade = digits <= 24? 4.3386528e-1L // single precision + : digits <= 53? 2.789358995219730e-1L // double precision + : digits <= 64? 2.4471944416607995472e-1L // extended precision + : digits <= 106? 
1.1016843812851143391275867258512e-1L // double-double + : 9.134603732914548552537150753385375e-2L; // quadruple precision MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>(); RealScalar normIminusT; int degree, degree2, numberOfSquareRoots = 0; @@ -264,7 +264,7 @@ inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT) 1.999045567181744e-1L, 2.789358995219730e-1L }; #elif LDBL_MANT_DIG <= 64 const int maxPadeDegree = 8; - const double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L, + const long double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L, 6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L }; #elif LDBL_MANT_DIG <= 106 const int maxPadeDegree = 10; @@ -298,7 +298,7 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar logCurr = log(curr); ComplexScalar logPrev = log(prev); - int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI)); + int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber); return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev); } @@ -383,7 +383,7 @@ class MatrixPower : internal::noncopyable private: typedef std::complex<RealScalar> ComplexScalar; - typedef Matrix<ComplexScalar, Dynamic, Dynamic, MatrixType::Options, + typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> ComplexMatrix; /** \brief Reference to the base of matrix power. 
*/ diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h index 9f08c6162..afd88ec4d 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h @@ -65,21 +65,6 @@ void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, ty sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs); } -// similar to compute1x1offDiagonalBlock() -template <typename MatrixType, typename ResultType> -void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) -{ - typedef typename traits<MatrixType>::Scalar Scalar; - Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i); - Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j); - Matrix<Scalar,2,2> C = T.template block<2,2>(i,j); - if (j-i > 2) - C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2); - Matrix<Scalar,2,2> X; - matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C); - sqrtT.template block<2,2>(i,j) = X; -} - // solves the equation A X + X B = C where all matrices are 2-by-2 template <typename MatrixType> void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C) @@ -98,13 +83,13 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const coeffMatrix.coeffRef(2,3) = B.coeff(1,0); coeffMatrix.coeffRef(3,1) = A.coeff(1,0); coeffMatrix.coeffRef(3,2) = B.coeff(0,1); - + Matrix<Scalar,4,1> rhs; rhs.coeffRef(0) = C.coeff(0,0); rhs.coeffRef(1) = C.coeff(0,1); rhs.coeffRef(2) = C.coeff(1,0); rhs.coeffRef(3) = C.coeff(1,1); - + Matrix<Scalar,4,1> result; result = coeffMatrix.fullPivLu().solve(rhs); @@ -114,6 +99,20 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const X.coeffRef(1,1) = result.coeff(3); } +// similar to compute1x1offDiagonalBlock() +template <typename MatrixType, typename ResultType> +void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +{ + typedef typename traits<MatrixType>::Scalar Scalar; + Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i); + Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j); + Matrix<Scalar,2,2> C = T.template block<2,2>(i,j); + if (j-i > 2) + C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2); + Matrix<Scalar,2,2> X; + matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C); + sqrtT.template block<2,2>(i,j) = X; +} // pre: T is quasi-upper-triangular and sqrtT is a zero matrix of the same size // post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T diff --git a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt b/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt deleted file mode 100644 index 1b887cc8e..000000000 --- a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_MoreVectorization_SRCS "*.h") - -INSTALL(FILES - ${Eigen_MoreVectorization_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MoreVectorization COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt b/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt deleted file mode 100644 index 9322ddadf..000000000 --- 
a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_NonLinearOptimization_SRCS "*.h") - -INSTALL(FILES - ${Eigen_NonLinearOptimization_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NonLinearOptimization COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h index b8ba6ddcb..8fe3ed86b 100644 --- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h @@ -150,7 +150,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType &x) fjac.resize(n, n); if (!useExternalScaling) diag.resize(n); - eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'"); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); /* Function Body */ nfev = 0; @@ -390,7 +390,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType & fvec.resize(n); if (!useExternalScaling) diag.resize(n); - eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'"); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); /* Function Body */ nfev = 0; diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h index 69106ddc5..fe3b79ca7 100644 --- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h @@ -179,7 +179,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType &x) fjac.resize(m, n); if (!useExternalScaling) diag.resize(n); - eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'"); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); qtf.resize(n); /* Function Body */ @@ -215,7 +215,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType &x) { using std::abs; using std::sqrt; - + eigen_assert(x.size()==n); // check the caller is not cheating us /* calculate the jacobian matrix. 
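   (Editorial aside on the eigen_assert changes in this hunk and in HybridNonLinearSolver.h:
   a string literal converts to a non-null pointer, so `condition || "message"` is always
   true and the old assertion could never fire; `condition && "message"` keeps the message
   while actually testing the condition. A minimal stand-alone illustration, with made-up
   names:

     #include <cassert>
     int main() {
       int n = 3, diag_size = 2;
       assert((diag_size == n) || "never fires - this is the bug being fixed");
       assert((diag_size == n) && "fires, and the message appears in the assert output");
       return 0;
     }
   )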
*/ @@ -398,7 +398,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType fjac.resize(n, n); if (!useExternalScaling) diag.resize(n); - eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'"); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); qtf.resize(n); /* Function Body */ diff --git a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt b/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt deleted file mode 100644 index 1199aca2f..000000000 --- a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_NumericalDiff_SRCS "*.h") - -INSTALL(FILES - ${Eigen_NumericalDiff_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NumericalDiff COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Polynomials/CMakeLists.txt b/unsupported/Eigen/src/Polynomials/CMakeLists.txt deleted file mode 100644 index 51f13f3cb..000000000 --- a/unsupported/Eigen/src/Polynomials/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Polynomials_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Polynomials_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Polynomials COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Skyline/CMakeLists.txt b/unsupported/Eigen/src/Skyline/CMakeLists.txt deleted file mode 100644 index 3bf1b0dd4..000000000 --- a/unsupported/Eigen/src/Skyline/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Skyline_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Skyline_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Skyline COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt b/unsupported/Eigen/src/SparseExtra/CMakeLists.txt deleted file mode 100644 index 7ea32ca5e..000000000 --- a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_SparseExtra_SRCS "*.h") - -INSTALL(FILES - ${Eigen_SparseExtra_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SparseExtra COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h new file mode 100644 index 000000000..ed415db99 --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h @@ -0,0 +1,124 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H +#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise incomplete gamma function. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar + * type T to be supported. 
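 * (Worked identity, added for illustration: igamma(a, x) is the lower regularized incomplete
 *  gamma function, so for a > 0 and x >= 0 the sum igamma(a, x) + igammac(a, x) evaluates to 1
 *  coefficient-wise, up to rounding.)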
+ * + * \sa Eigen::igammac(), Eigen::lgamma() + */ +template<typename Derived,typename ExponentDerived> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived> +igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise complementary incomplete gamma function. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::igamma(), Eigen::lgamma() + */ +template<typename Derived,typename ExponentDerived> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived> +igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays. + * + * It returns the \a n -th derivative of the digamma(psi) evaluated at \c x. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::digamma() + */ +// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x) +// * \sa ArrayBase::polygamma() +template<typename DerivedN,typename DerivedX> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX> +polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>( + n.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given arrays. + * + * This function computes the regularized incomplete beta function (integral). + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar + * type T to be supported. 
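 * (Worked check, added for illustration: with a = b = 1 the integrand is constant, so
 *  betainc(1, 1, x) = x for x in [0, 1], i.e. the CDF of the uniform distribution.)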
+ * + * \sa Eigen::betainc(), Eigen::lgamma() + */ +template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived> +inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived> +betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x) +{ + return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>( + a.derived(), + b.derived(), + x.derived() + ); +} + + +/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays. + * + * It returns the Riemann zeta function of two arguments \a x and \a q: + * + * \param x is the exponent, it must be > 1 + * \param q is the shift, it must be > 0 + * + * \note This function supports only float and double scalar types. To support other scalar types, the user has + * to provide implementations of zeta(T,T) for any scalar type T to be supported. + * + * \sa ArrayBase::zeta() + */ +template<typename DerivedX,typename DerivedQ> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ> +zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>( + x.derived(), + q.derived() + ); +} + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h new file mode 100644 index 000000000..d8f2363be --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com> +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
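A minimal usage sketch for the array-level API defined in SpecialFunctionsArrayAPI.h above
(editorial illustration only: the include path and the sample values are assumptions, and
float/double support requires C++11/C99 math as stated in the docs):

  #include <Eigen/Core>
  #include <unsupported/Eigen/SpecialFunctions>
  #include <iostream>

  int main() {
    Eigen::ArrayXd a(3), x(3);
    a << 0.5, 2.0, 5.0;
    x << 1.0, 2.0, 4.0;

    Eigen::ArrayXd p = Eigen::igamma(a, x);      // lower regularized incomplete gamma
    Eigen::ArrayXd q = Eigen::igammac(a, x);     // its complement
    Eigen::ArrayXd z = Eigen::zeta(x + 1.0, a);  // two-argument (Hurwitz) zeta: x > 1, q > 0

    std::cout << (p + q).transpose() << "\n";    // prints approximately 1 1 1
    return 0;
  }

betainc() and polygamma() follow the same pattern of taking Array expressions and returning
a lazily evaluated coefficient-wise expression.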
+ +#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H +#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H + +namespace Eigen { + +namespace internal { + + +/** \internal + * \brief Template functor to compute the incomplete gamma function igamma(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma + */ +template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar> +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igamma; return igamma(a, x); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + return internal::pigamma(a, x); + } +}; +template<typename Scalar> +struct functor_traits<scalar_igamma_op<Scalar> > { + enum { + // Guesstimate + Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasIGamma + }; +}; + + +/** \internal + * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igammac + */ +template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar> +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igammac; return igammac(a, x); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const + { + return internal::pigammac(a, x); + } +}; +template<typename Scalar> +struct functor_traits<scalar_igammac_op<Scalar> > { + enum { + // Guesstimate + Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasIGammac + }; +}; + + +/** \internal + * \brief Template functor to compute the incomplete beta integral betainc(a, b, x) + * + */ +template<typename Scalar> struct scalar_betainc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const { + using numext::betainc; return betainc(x, a, b); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const + { + return internal::pbetainc(x, a, b); + } +}; +template<typename Scalar> +struct functor_traits<scalar_betainc_op<Scalar> > { + enum { + // Guesstimate + Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasBetaInc + }; +}; + + +/** \internal + * \brief Template functor to compute the natural log of the absolute + * value of Gamma of a scalar + * \sa class CwiseUnaryOp, Cwise::lgamma() + */ +template<typename Scalar> struct scalar_lgamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::lgamma; return lgamma(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_lgamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasLGamma + }; +}; + +/** \internal + * \brief Template functor to compute psi, the 
derivative of lgamma of a scalar. + * \sa class CwiseUnaryOp, Cwise::digamma() + */ +template<typename Scalar> struct scalar_digamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::digamma; return digamma(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_digamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasDiGamma + }; +}; + +/** \internal + * \brief Template functor to compute the Riemann Zeta function of two arguments. + * \sa class CwiseUnaryOp, Cwise::zeta() + */ +template<typename Scalar> struct scalar_zeta_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const { + using numext::zeta; return zeta(x, q); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); } +}; +template<typename Scalar> +struct functor_traits<scalar_zeta_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasZeta + }; +}; + +/** \internal + * \brief Template functor to compute the polygamma function. + * \sa class CwiseUnaryOp, Cwise::polygamma() + */ +template<typename Scalar> struct scalar_polygamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const { + using numext::polygamma; return polygamma(n, x); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); } +}; +template<typename Scalar> +struct functor_traits<scalar_polygamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasPolygamma + }; +}; + +/** \internal + * \brief Template functor to compute the Gauss error function of a + * scalar + * \sa class CwiseUnaryOp, Cwise::erf() + */ +template<typename Scalar> struct scalar_erf_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erf; return erf(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_erf_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasErf + }; +}; + +/** \internal + * \brief Template functor to compute the Complementary Error Function + * of a scalar + * \sa class CwiseUnaryOp, Cwise::erfc() + */ +template<typename Scalar> struct scalar_erfc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erfc; return erfc(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return 
internal::perfc(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_erfc_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasErfc + }; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h new file mode 100644 index 000000000..553bcda6a --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h @@ -0,0 +1,47 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H +#define EIGEN_SPECIALFUNCTIONS_HALF_H + +namespace Eigen { +namespace numext { + +#if EIGEN_HAS_C99_MATH +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::digamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) { + return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) { + return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erf(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erfc(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) { + return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x))); +} +#endif + +} // end namespace numext +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_HALF_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h new file mode 100644 index 000000000..52619fc0c --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -0,0 +1,1551 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIAL_FUNCTIONS_H +#define EIGEN_SPECIAL_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Parts of this code are based on the Cephes Math Library. +// +// Cephes Math Library Release 2.8: June, 2000 +// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier +// +// Permission has been kindly provided by the original author +// to incorporate the Cephes software into the Eigen codebase: +// +// From: Stephen Moshier +// To: Eugene Brevdo +// Subject: Re: Permission to wrap several cephes functions in Eigen +// +// Hello Eugene, +// +// Thank you for writing. +// +// If your licensing is similar to BSD, the formal way that has been +// handled is simply to add a statement to the effect that you are incorporating +// the Cephes software by permission of the author. +// +// Good luck with your project, +// Steve + +namespace cephes { + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl<decltype(x), N>( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template <typename Scalar, int N> +struct polevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) { + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N]; + } +}; + +template <typename Scalar> +struct polevl<Scalar, 0> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) { + return coef[0]; + } +}; + +} // end namespace cephes + +/**************************************************************************** + * Implementation of lgamma, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct lgamma_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct lgamma_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); } +}; + +template <> +struct lgamma_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); } +}; +#endif + +/**************************************************************************** + * Implementation of digamma (psi), based on Cephes * + ****************************************************************************/ + +template <typename Scalar> +struct digamma_retval { + typedef Scalar type; +}; + +/* + * + * Polynomial evaluation helper for the Psi (digamma) function. 
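 * (Aside on cephes::polevl defined above, with a worked example: coefficients are passed
 *  highest degree first, so cephes::polevl<double, 2>::run(x, c) with c[] = {2.0, 3.0, 5.0}
 *  evaluates (2*x + 3)*x + 5 = 2*x^2 + 3*x + 5 by Horner's rule.)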
+ * + * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for + * input Scalar s, assuming s is above 10.0. + * + * If s is above a certain threshold for the given Scalar type, zero + * is returned. Otherwise the polynomial is evaluated with enough + * coefficients for results matching Scalar machine precision. + * + * + */ +template <typename Scalar> +struct digamma_impl_maybe_poly { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + + +template <> +struct digamma_impl_maybe_poly<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float s) { + const float A[] = { + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f + }; + + float z; + if (s < 1.0e8f) { + z = 1.0f / (s * s); + return z * cephes::polevl<float, 3>::run(z, A); + } else return 0.0f; + } +}; + +template <> +struct digamma_impl_maybe_poly<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double s) { + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + double z; + if (s < 1.0e17) { + z = 1.0 / (s * s); + return z * cephes::polevl<double, 6>::run(z, A); + } + else return 0.0; + } +}; + +template <typename Scalar> +struct digamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x) { + /* + * + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * double x, y, psi(); + * + * y = psi( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. 
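 * As a quick sanity check of the identities above: psi(1) = -gamma (the Euler-Mascheroni
 * constant) ~ -0.5772157, and the recurrence gives psi(2) = psi(1) + 1 ~ 0.4227843.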
+ * + * ACCURACY (float): + * Relative error (except absolute when |psi| < 1): + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 1.3e-15 1.4e-16 + * IEEE -30,0 40000 1.5e-15 2.2e-16 + * + * ACCURACY (double): + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 INFINITY + */ + + Scalar p, q, nz, s, w, y; + bool negative = false; + + const Scalar maxnum = NumTraits<Scalar>::infinity(); + const Scalar m_pi = Scalar(EIGEN_PI); + + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + nz = zero; + + if (x <= zero) { + negative = true; + q = x; + p = numext::floor(q); + if (p == q) { + return maxnum; + } + /* Remove the zeros of tan(m_pi x) + * by subtracting the nearest integer from x + */ + nz = q - p; + if (nz != half) { + if (nz > half) { + p += one; + nz = q - p; + } + nz = m_pi / numext::tan(m_pi * nz); + } + else { + nz = zero; + } + x = one - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = zero; + while (s < Scalar(10)) { + w += one / s; + s += one; + } + + y = digamma_impl_maybe_poly<Scalar>::run(s); + + y = numext::log(s) - (half / s) - y - w; + + return (negative) ? y - nz : y; + } +}; + +/**************************************************************************** + * Implementation of erf, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct erf_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct erf_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erf_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); } +}; + +template <> +struct erf_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/*************************************************************************** +* Implementation of erfc, requires C++11/C99 * +****************************************************************************/ + +template <typename Scalar> +struct erfc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct erfc_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erfc_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); } +}; + +template <> +struct erfc_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************************** + * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 * + **************************************************************************************************************/ + +template <typename Scalar> +struct igammac_retval { + 
typedef Scalar type; +}; + +// NOTE: cephes_helper is also used to implement zeta +template <typename Scalar> +struct cephes_helper { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; } +}; + +template <> +struct cephes_helper<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float machep() { + return NumTraits<float>::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float big() { + // use epsneg (1.0 - epsneg == 1.0) + return 1.0f / (NumTraits<float>::epsilon() / 2); + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float biginv() { + // epsneg + return machep(); + } +}; + +template <> +struct cephes_helper<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double machep() { + return NumTraits<double>::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double big() { + return 1.0 / NumTraits<double>::epsilon(); + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double biginv() { + // inverse of eps + return NumTraits<double>::epsilon(); + } +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> struct igamma_impl; // predeclare igamma_impl + +template <typename Scalar> +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igamc() + * + * Incomplete gamma integral (modified for Eigen) + * + * + * + * SYNOPSIS: + * + * double a, x, y, igamc(); + * + * y = igamc( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * + * igamc(a,x) = 1 - igam(a,x) + * + * inf. + * - + * 1 | | -t a-1 + * = ----- | e t dt. + * - | | + * | (a) - + * x + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 7.8e-6 5.9e-7 + * + * + * ACCURACY (double): + * + * Tested at random a, x. + * a x Relative error: + * arithmetic domain domain # trials peak rms + * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15 + * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if ((x < zero) || (a <= zero)) { + // domain error + return nan; + } + + if ((x < one) || (x < a)) { + /* The checks above ensure that we meet the preconditions for + * igamma_impl::Impl(), so call it, rather than igamma_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. 
This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igamma_impl<Scalar>::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igamma_impl calls igammac_impl::Impl. */ + friend struct igamma_impl<Scalar>; + + /* Actually computes igamc(a, x). + * + * Preconditions: + * a > 0 + * x >= 1 + * x >= a + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + const Scalar big = cephes_helper<Scalar>::big(); + const Scalar biginv = cephes_helper<Scalar>::biginv(); + const Scalar inf = NumTraits<Scalar>::infinity(); + + Scalar ans, ax, c, yc, r, t, y, z; + Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; + + if (x == inf) return zero; // std::isinf crashes on CUDA + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); + if (ax < -maxlog) { // underflow + return zero; + } + ax = numext::exp(ax); + + // continued fraction + y = one - a; + z = x + y + one; + c = zero; + pkm2 = one; + qkm2 = x; + pkm1 = x + one; + qkm1 = z * x; + ans = pkm1 / qkm1; + + while (true) { + c += one; + y += one; + z += two; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != zero) { + r = pk / qk; + t = numext::abs((ans - r) / r); + ans = r; + } else { + t = one; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (numext::abs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if (t <= machep) { + break; + } + } + + return (ans * ax); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************ + * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 * + ************************************************************************************************/ + +template <typename Scalar> +struct igamma_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct igamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igam() + * Incomplete gamma integral + * + * + * + * SYNOPSIS: + * + * double a, x, y, igam(); + * + * y = igam( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * x + * - + * 1 | | -t a-1 + * igam(a,x) = ----- | e t dt. + * - | | + * | (a) - + * 0 + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (double): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 200000 3.6e-14 2.9e-15 + * IEEE 0,100 300000 9.9e-14 1.5e-14 + * + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 20000 7.8e-6 5.9e-7 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. 
Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + + /* left tail of incomplete gamma function: + * + * inf. k + * a -x - x + * x e > ---------- + * - - + * k=0 | (a+k+1) + * + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if (x == zero) return zero; + + if ((x < zero) || (a <= zero)) { // domain error + return nan; + } + + if ((x > one) && (x > a)) { + /* The checks above ensure that we meet the preconditions for + * igammac_impl::Impl(), so call it, rather than igammac_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igammac_impl<Scalar>::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igammac_impl calls igamma_impl::Impl. */ + friend struct igammac_impl<Scalar>; + + /* Actually computes igam(a, x). + * + * Preconditions: + * x > 0 + * a > 0 + * !(x > 1 && x > a) + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + + Scalar ans, ax, c, r; + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); + if (ax < -maxlog) { + // underflow + return zero; + } + ax = numext::exp(ax); + + /* power series */ + r = a; + c = one; + ans = one; + + while (true) { + r += one; + c *= x/r; + ans += c; + if (c/ans <= machep) { + break; + } + } + + return (ans * ax / a); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/***************************************************************************** + * Implementation of Riemann zeta function of two arguments, based on Cephes * + *****************************************************************************/ + +template <typename Scalar> +struct zeta_retval { + typedef Scalar type; +}; + +template <typename Scalar> +struct zeta_impl_series { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <> +struct zeta_impl_series<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) { + int i = 0; + while(i < 9) + { + i += 1; + a += 1.0f; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <> +struct zeta_impl_series<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) { + int i = 0; + while( (i < 9) || (a <= 9.0) ) + { + i += 1; + a += 1.0; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <typename Scalar> +struct zeta_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x, Scalar q) { + /* zeta.c + * + * Riemann zeta function of two arguments + * + * + * + * SYNOPSIS: + * + * double x, q, y, zeta(); + * + * y = zeta( x, q ); + * + * + * + * DESCRIPTION: + * + * + * + * inf. 
+ * - -x + * zeta(x,q) = > (k+q) + * - + * k=0 + * + * where x > 1 and q is not a negative integer or zero. + * The Euler-Maclaurin summation formula is used to obtain + * the expansion + * + * n + * - -x + * zeta(x,q) = > (k+q) + * - + * k=1 + * + * 1-x inf. B x(x+1)...(x+2j) + * (n+q) 1 - 2j + * + --------- - ------- + > -------------------- + * x-1 x - x+2j+1 + * 2(n+q) j=1 (2j)! (n+q) + * + * where the B2j are Bernoulli numbers. Note that (see zetac.c) + * zeta(x,1) = zetac(x) + 1. + * + * + * + * ACCURACY: + * + * Relative error for single precision: + * arithmetic domain # trials peak rms + * IEEE 0,25 10000 6.9e-7 1.0e-7 + * + * Large arguments may produce underflow in powf(), in which + * case the results are inaccurate. + * + * REFERENCE: + * + * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals, + * Series, and Products, p. 1073; Academic Press, 1980. + * + */ + + int i; + Scalar p, r, a, b, k, s, t, w; + + const Scalar A[] = { + Scalar(12.0), + Scalar(-720.0), + Scalar(30240.0), + Scalar(-1209600.0), + Scalar(47900160.0), + Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/ + Scalar(7.47242496e10), + Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/ + Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/ + Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/ + Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/ + Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/ + }; + + const Scalar maxnum = NumTraits<Scalar>::infinity(); + const Scalar zero = 0.0, half = 0.5, one = 1.0; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if( x == one ) + return maxnum; + + if( x < one ) + { + return nan; + } + + if( q <= zero ) + { + if(q == numext::floor(q)) + { + return maxnum; + } + p = x; + r = numext::floor(p); + if (p != r) + return nan; + } + + /* Permit negative q but continue sum until n+q > +9 . + * This case should be handled by a reflection formula. + * If q<0 and x is an integer, there is a relation to + * the polygamma function. 
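For a quick numerical cross-check of the summation implemented below, the numext::zeta wrapper declared at the end of this header can be called directly: zeta(x, 1) reduces to the ordinary Riemann zeta function, and incrementing q removes exactly the k = 0 term of the defining sum. A minimal sketch (the include path and tolerances are illustrative assumptions, not part of this change):

#include <cassert>
#include <cmath>
#include <unsupported/Eigen/SpecialFunctions>  // assumed entry point for numext::zeta

void check_hurwitz_zeta() {
  const double pi = 3.14159265358979323846;
  // zeta(2, 1) is the Riemann zeta value zeta(2) = pi^2 / 6.
  assert(std::abs(Eigen::numext::zeta(2.0, 1.0) - pi * pi / 6.0) < 1e-10);
  // Shifting q by one drops the k = 0 term: zeta(x, q) - zeta(x, q + 1) = q^(-x).
  assert(std::abs(Eigen::numext::zeta(3.0, 2.0) - Eigen::numext::zeta(3.0, 3.0)
                  - std::pow(2.0, -3.0)) < 1e-10);
}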
+ */ + s = numext::pow( q, -x ); + a = q; + b = zero; + // Run the summation in a helper function that is specific to the floating precision + if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) { + return s; + } + + w = a; + s += b*w/(x-one); + s -= half * b; + a = one; + k = zero; + for( i=0; i<12; i++ ) + { + a *= x + k; + b /= w; + t = a*b/A[i]; + s = s + t; + t = numext::abs(t/s); + if( t < machep ) { + break; + } + k += one; + a *= x + k; + b /= w; + k += one; + } + return s; + } +}; + +/**************************************************************************** + * Implementation of polygamma function, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct polygamma_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar n, Scalar x) { + Scalar zero = 0.0, one = 1.0; + Scalar nplus = n + one; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + // Check that n is an integer + if (numext::floor(n) != n) { + return nan; + } + // Just return the digamma function for n = 1 + else if (n == zero) { + return digamma_impl<Scalar>::run(x); + } + // Use the same implementation as scipy + else { + Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus)); + return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x); + } + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************ + * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 * + ************************************************************************************************/ + +template <typename Scalar> +struct betainc_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct betainc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct betainc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) { + /* betaincf.c + * + * Incomplete beta integral + * + * + * SYNOPSIS: + * + * float a, b, x, y, betaincf(); + * + * y = betaincf( a, b, x ); + * + * + * DESCRIPTION: + * + * Returns incomplete beta integral of the arguments, evaluated + * from zero to x. The function is defined as + * + * x + * - - + * | (a+b) | | a-1 b-1 + * ----------- | t (1-t) dt. + * - - | | + * | (a) | (b) - + * 0 + * + * The domain of definition is 0 <= x <= 1. In this + * implementation a and b are restricted to positive values. + * The integral from x to 1 may be obtained by the symmetry + * relation + * + * 1 - betainc( a, b, x ) = betainc( b, a, 1-x ). + * + * The integral is evaluated by a continued fraction expansion. + * If a < 1, the function calls itself recursively after a + * transformation to increase a to a+1. 
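The symmetry relation quoted above is easy to exercise through the numext::betainc wrapper declared at the end of this header. A minimal sketch (argument values, tolerance and include path are illustrative assumptions):

#include <cassert>
#include <cmath>
#include <unsupported/Eigen/SpecialFunctions>  // assumed entry point for numext::betainc

void check_betainc_symmetry() {
  const double a = 2.5, b = 4.0, x = 0.3;
  // 1 - betainc(a, b, x) == betainc(b, a, 1 - x), up to roundoff.
  const double lhs = 1.0 - Eigen::numext::betainc(a, b, x);
  const double rhs = Eigen::numext::betainc(b, a, 1.0 - x);
  assert(std::abs(lhs - rhs) < 1e-10);
}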
+ * + * ACCURACY (float): + * + * Tested at random points (a,b,x) with a and b in the indicated + * interval and x between 0 and 1. + * + * arithmetic domain # trials peak rms + * Relative error: + * IEEE 0,30 10000 3.7e-5 5.1e-6 + * IEEE 0,100 10000 1.7e-4 2.5e-5 + * The useful domain for relative error is limited by underflow + * of the single precision exponential function. + * Absolute error: + * IEEE 0,30 100000 2.2e-5 9.6e-7 + * IEEE 0,100 10000 6.5e-5 3.7e-6 + * + * Larger errors may occur for extreme ratios of a and b. + * + * ACCURACY (double): + * arithmetic domain # trials peak rms + * IEEE 0,5 10000 6.9e-15 4.5e-16 + * IEEE 0,85 250000 2.2e-13 1.7e-14 + * IEEE 0,1000 30000 5.3e-12 6.3e-13 + * IEEE 0,10000 250000 9.3e-11 7.1e-12 + * IEEE 0,100000 10000 8.7e-10 4.8e-11 + * Outputs smaller than the IEEE gradual underflow threshold + * were excluded from these statistics. + * + * ERROR MESSAGES: + * message condition value returned + * incbet domain x<0, x>1 nan + * incbet underflow nan + */ + + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True) + * Continued fraction expansion #2 for incomplete beta integral (small_branch = False) + */ +template <typename Scalar> +struct incbeta_cfe { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value || + internal::is_same<Scalar, double>::value), + THIS_TYPE_IS_NOT_SUPPORTED); + const Scalar big = cephes_helper<Scalar>::big(); + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar biginv = cephes_helper<Scalar>::biginv(); + + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + + Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2; + Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update; + Scalar ans; + int n; + + const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300; + const Scalar thresh = + (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep; + Scalar r = (internal::is_same<Scalar, float>::value) ? 
zero : one; + + if (small_branch) { + k1 = a; + k2 = a + b; + k3 = a; + k4 = a + one; + k5 = one; + k6 = b - one; + k7 = k4; + k8 = a + two; + k26update = one; + } else { + k1 = a; + k2 = b - one; + k3 = a; + k4 = a + one; + k5 = one; + k6 = a + b; + k7 = a + one; + k8 = a + two; + k26update = -one; + x = x / (one - x); + } + + pkm2 = zero; + qkm2 = one; + pkm1 = one; + qkm1 = one; + ans = one; + n = 0; + + do { + xk = -(x * k1 * k2) / (k3 * k4); + pk = pkm1 + pkm2 * xk; + qk = qkm1 + qkm2 * xk; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + xk = (x * k5 * k6) / (k7 * k8); + pk = pkm1 + pkm2 * xk; + qk = qkm1 + qkm2 * xk; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + if (qk != zero) { + r = pk / qk; + if (numext::abs(ans - r) < numext::abs(r) * thresh) { + return r; + } + ans = r; + } + + k1 += one; + k2 += k26update; + k3 += two; + k4 += two; + k5 += one; + k6 -= k26update; + k7 += two; + k8 += two; + + if ((numext::abs(qk) + numext::abs(pk)) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) { + pkm2 *= big; + pkm1 *= big; + qkm2 *= big; + qkm1 *= big; + } + } while (++n < num_iters); + + return ans; + } +}; + +/* Helper functions depending on the Scalar type */ +template <typename Scalar> +struct betainc_helper {}; + +template <> +struct betainc_helper<float> { + /* Core implementation, assumes a large (> 1.0) */ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb, + float xx) { + float ans, a, b, t, x, onemx; + bool reversed_a_b = false; + + onemx = 1.0f - xx; + + /* see if x is greater than the mean */ + if (xx > (aa / (aa + bb))) { + reversed_a_b = true; + a = bb; + b = aa; + t = xx; + x = onemx; + } else { + a = aa; + b = bb; + t = onemx; + x = xx; + } + + /* Choose expansion for optimal convergence */ + if (b > 10.0f) { + if (numext::abs(b * x / a) < 0.3f) { + t = betainc_helper<float>::incbps(a, b, x); + if (reversed_a_b) t = 1.0f - t; + return t; + } + } + + ans = x * (a + b - 2.0f) / (a - 1.0f); + if (ans < 1.0f) { + ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */); + t = b * numext::log(t); + } else { + ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */); + t = (b - 1.0f) * numext::log(t); + } + + t += a * numext::log(x) + lgamma_impl<float>::run(a + b) - + lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b); + t += numext::log(ans / a); + t = numext::exp(t); + + if (reversed_a_b) t = 1.0f - t; + return t; + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) { + float t, u, y, s; + const float machep = cephes_helper<float>::machep(); + + y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a); + y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b); + y += lgamma_impl<float>::run(a + b); + + t = x / (1.0f - x); + s = 0.0f; + u = 1.0f; + do { + b -= 1.0f; + if (b == 0.0f) { + break; + } + a += 1.0f; + u *= t * b / a; + s += u; + } while (numext::abs(u) > machep); + + return numext::exp(y) * (1.0f + s); + } +}; + +template <> +struct betainc_impl<float> { + EIGEN_DEVICE_FUNC + static float run(float a, float b, float x) { + const float nan = NumTraits<float>::quiet_NaN(); + float ans, t; + + if (a <= 0.0f) return nan; + if (b <= 0.0f) return nan; + if ((x <= 0.0f) || (x >= 1.0f)) { + if (x == 0.0f) return 0.0f; + if (x == 1.0f) return 1.0f; + // mtherr("betaincf", DOMAIN); + return nan; + } + + /* transformation for small aa */ + if 
(a <= 1.0f) { + ans = betainc_helper<float>::incbsa(a + 1.0f, b, x); + t = a * numext::log(x) + b * numext::log1p(-x) + + lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a + 1.0f) - + lgamma_impl<float>::run(b); + return (ans + numext::exp(t)); + } else { + return betainc_helper<float>::incbsa(a, b, x); + } + } +}; + +template <> +struct betainc_helper<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) { + const double machep = cephes_helper<double>::machep(); + + double s, t, u, v, n, t1, z, ai; + + ai = 1.0 / a; + u = (1.0 - b) * x; + v = u / (a + 1.0); + t1 = v; + t = u; + n = 2.0; + s = 0.0; + z = machep * ai; + while (numext::abs(v) > z) { + u = (n - b) * x / n; + t *= u; + v = t / (a + n); + s += v; + n += 1.0; + } + s += t1; + s += ai; + + u = a * numext::log(x); + // TODO: gamma() is not directly implemented in Eigen. + /* + if ((a + b) < maxgam && numext::abs(u) < maxlog) { + t = gamma(a + b) / (gamma(a) * gamma(b)); + s = s * t * pow(x, a); + } else { + */ + t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) - + lgamma_impl<double>::run(b) + u + numext::log(s); + return s = numext::exp(t); + } +}; + +template <> +struct betainc_impl<double> { + EIGEN_DEVICE_FUNC + static double run(double aa, double bb, double xx) { + const double nan = NumTraits<double>::quiet_NaN(); + const double machep = cephes_helper<double>::machep(); + // const double maxgam = 171.624376956302725; + + double a, b, t, x, xc, w, y; + bool reversed_a_b = false; + + if (aa <= 0.0 || bb <= 0.0) { + return nan; // goto domerr; + } + + if ((xx <= 0.0) || (xx >= 1.0)) { + if (xx == 0.0) return (0.0); + if (xx == 1.0) return (1.0); + // mtherr("incbet", DOMAIN); + return nan; + } + + if ((bb * xx) <= 1.0 && xx <= 0.95) { + return betainc_helper<double>::incbps(aa, bb, xx); + } + + w = 1.0 - xx; + + /* Reverse a and b if x is greater than the mean. */ + if (xx > (aa / (aa + bb))) { + reversed_a_b = true; + a = bb; + b = aa; + xc = xx; + x = w; + } else { + a = aa; + b = bb; + xc = w; + x = xx; + } + + if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) { + t = betainc_helper<double>::incbps(a, b, x); + if (t <= machep) { + t = 1.0 - machep; + } else { + t = 1.0 - t; + } + return t; + } + + /* Choose expansion for better convergence. */ + y = x * (a + b - 2.0) - (a - 1.0); + if (y < 0.0) { + w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */); + } else { + w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc; + } + + /* Multiply w by the factor + a b _ _ _ + x (1-x) | (a+b) / ( a | (a) | (b) ) . */ + + y = a * numext::log(x); + t = b * numext::log(xc); + // TODO: gamma is not directly implemented in Eigen. + /* + if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog) + { + t = pow(xc, b); + t *= pow(x, a); + t /= a; + t *= w; + t *= gamma(a + b) / (gamma(a) * gamma(b)); + } else { + */ + /* Resort to logarithms. 
*/ + y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) - + lgamma_impl<double>::run(b); + y += numext::log(w / a); + t = numext::exp(y); + + /* } */ + // done: + + if (reversed_a_b) { + if (t <= machep) { + t = 1.0 - machep; + } else { + t = 1.0 - t; + } + } + return t; + } +}; + +#endif // EIGEN_HAS_C99_MATH + +} // end namespace internal + +namespace numext { + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) + lgamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar) + digamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar) +zeta(const Scalar& x, const Scalar& q) { + return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar) +polygamma(const Scalar& n, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) + erf(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) + erfc(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) + igamma(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) + igammac(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar) + betainc(const Scalar& a, const Scalar& b, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x); +} + +} // end namespace numext + + +} // end namespace Eigen + +#endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h new file mode 100644 index 000000000..46d60d323 --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h @@ -0,0 +1,58 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H +#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H + +namespace Eigen { + +namespace internal { + +/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); } + +/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); } + +/** \internal \returns the zeta function of two arguments (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); } + +/** \internal \returns the polygamma function (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); } + +/** \internal \returns the erf(\a a) (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet perf(const Packet& a) { using numext::erf; return erf(a); } + +/** \internal \returns the erfc(\a a) (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } + +/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */ +template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); } + +/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */ +template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); } + +/** \internal \returns the complementary incomplete gamma function betainc(\a a, \a b, \a x) */ +template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); } + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H + diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h new file mode 100644 index 000000000..ec4fa8448 --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h @@ -0,0 +1,165 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H +#define EIGEN_CUDA_SPECIALFUNCTIONS_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
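In other words, the float4/double2 overloads below are only seen when the file is compiled by nvcc with EIGEN_USE_GPU; elsewhere the generic forwarding helpers from SpecialFunctionsPacketMath.h provide the default and defer, coefficient by coefficient, to the scalar numext routines. A minimal host-side sketch of that fallback (include path assumed, argument values arbitrary):

#include <unsupported/Eigen/SpecialFunctions>  // assumed to pull in the generic packet helpers

// Instantiating the generic helper with a plain scalar "packet" simply calls
// the Cephes-based scalar implementation.
float igamma_fallback_example() {
  return Eigen::internal::pigamma<float>(2.0f, 1.0f);  // same as numext::igamma(2.0f, 1.0f)
}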
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plgamma<float4>(const float4& a) +{ + return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plgamma<double2>(const double2& a) +{ + using numext::lgamma; + return make_double2(lgamma(a.x), lgamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pdigamma<float4>(const float4& a) +{ + using numext::digamma; + return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pdigamma<double2>(const double2& a) +{ + using numext::digamma; + return make_double2(digamma(a.x), digamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pzeta<float4>(const float4& x, const float4& q) +{ + using numext::zeta; + return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pzeta<double2>(const double2& x, const double2& q) +{ + using numext::zeta; + return make_double2(zeta(x.x, q.x), zeta(x.y, q.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 ppolygamma<float4>(const float4& n, const float4& x) +{ + using numext::polygamma; + return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 ppolygamma<double2>(const double2& n, const double2& x) +{ + using numext::polygamma; + return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perf<float4>(const float4& a) +{ + return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perf<double2>(const double2& a) +{ + using numext::erf; + return make_double2(erf(a.x), erf(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perfc<float4>(const float4& a) +{ + using numext::erfc; + return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perfc<double2>(const double2& a) +{ + using numext::erfc; + return make_double2(erfc(a.x), erfc(a.y)); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigamma<float4>(const float4& a, const float4& x) +{ + using numext::igamma; + return make_float4( + igamma(a.x, x.x), + igamma(a.y, x.y), + igamma(a.z, x.z), + igamma(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigamma<double2>(const double2& a, const double2& x) +{ + using numext::igamma; + return make_double2(igamma(a.x, x.x), igamma(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigammac<float4>(const float4& a, const float4& x) +{ + using numext::igammac; + return make_float4( + igammac(a.x, x.x), + igammac(a.y, x.y), + igammac(a.z, x.z), + igammac(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac<double2>(const double2& a, const double2& x) +{ + using numext::igammac; + return make_double2(igammac(a.x, x.x), igammac(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x) +{ + using numext::betainc; + return make_float4( + betainc(a.x, b.x, x.x), + betainc(a.y, b.y, x.y), + betainc(a.z, b.z, x.z), + betainc(a.w, b.w, x.w)); +} + +template<> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x) +{ + using numext::betainc; + return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H diff --git a/unsupported/Eigen/src/Splines/CMakeLists.txt b/unsupported/Eigen/src/Splines/CMakeLists.txt deleted file mode 100644 index 55c6271e9..000000000 --- a/unsupported/Eigen/src/Splines/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Splines_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Splines_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Splines COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h index d1636f466..627f6e482 100644 --- a/unsupported/Eigen/src/Splines/Spline.h +++ b/unsupported/Eigen/src/Splines/Spline.h @@ -94,7 +94,7 @@ namespace Eigen const KnotVectorType& knots() const { return m_knots; } /** - * \brief Returns the knots of the underlying spline. + * \brief Returns the ctrls of the underlying spline. **/ const ControlPointVectorType& ctrls() const { return m_ctrls; } @@ -394,7 +394,7 @@ namespace Eigen Matrix<Scalar,Order,Order> ndu(p+1,p+1); - double saved, temp; + Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that? ndu(0,0) = 1.0; @@ -433,7 +433,7 @@ namespace Eigen // Compute the k-th derivative for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k) { - double d = 0.0; + Scalar d = 0.0; DenseIndex rk,pk,j1,j2; rk = r-k; pk = p-k; diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp index 6b6fac075..afb0c94c2 100644 --- a/unsupported/doc/examples/BVH_Example.cpp +++ b/unsupported/doc/examples/BVH_Example.cpp @@ -6,9 +6,7 @@ using namespace Eigen; typedef AlignedBox<double, 2> Box2d; namespace Eigen { - namespace internal { - Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point - } + Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point } struct PointPointMinimizer //how to compute squared distances between points and rectangles diff --git a/unsupported/doc/examples/EulerAngles.cpp b/unsupported/doc/examples/EulerAngles.cpp new file mode 100644 index 000000000..1ef6aee18 --- /dev/null +++ b/unsupported/doc/examples/EulerAngles.cpp @@ -0,0 +1,46 @@ +#include <unsupported/Eigen/EulerAngles> +#include <iostream> + +using namespace Eigen; + +int main() +{ + // A common Euler system by many armies around the world, + // where the first one is the azimuth(the angle from the north - + // the same angle that is show in compass) + // and the second one is elevation(the angle from the horizon) + // and the third one is roll(the angle between the horizontal body + // direction and the plane ground surface) + // Keep remembering we're using radian angles here! + typedef EulerSystem<-EULER_Z, EULER_Y, EULER_X> MyArmySystem; + typedef EulerAngles<double, MyArmySystem> MyArmyAngles; + + MyArmyAngles vehicleAngles( + 3.14/*PI*/ / 2, /* heading to east, notice that this angle is counter-clockwise */ + -0.3, /* going down from a mountain */ + 0.1); /* slightly rolled to the right */ + + // Some Euler angles representation that our plane use. 
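// (EulerAnglesZYZd is the double-precision Z-Y-Z convention typedef provided by
// the EulerAngles module; as above, the three constructor arguments are angles
// in radians.)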
+ EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794); + + MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles); + + std::cout << "vehicle angles(MyArmy): " << vehicleAngles << std::endl; + std::cout << "plane angles(ZYZ): " << planeAngles << std::endl; + std::cout << "plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl; + + // Now lets rotate the plane a little bit + std::cout << "==========================================================\n"; + std::cout << "rotating plane now!\n"; + std::cout << "==========================================================\n"; + + Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles; + + planeAngles = planeRotated; + planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated); + + std::cout << "new plane angles(ZYZ): " << planeAngles << std::endl; + std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl; + + return 0; +} diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 22442b394..a1823beaa 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -59,6 +59,8 @@ ei_add_test(alignedvector3) ei_add_test(FFT) +ei_add_test(EulerAngles) + find_package(MPFR 2.3.0) find_package(GMP) if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11) @@ -109,10 +111,14 @@ ei_add_test(gmres) ei_add_test(minres) ei_add_test(levenberg_marquardt) ei_add_test(kronecker_product) +ei_add_test(special_functions) # TODO: The following test names are prefixed with the cxx11 string, since historically # the tests depended on c++11. This isn't the case anymore so we ought to rename them. -ei_add_test(cxx11_float16) +# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests +# when using visual studio. We should make the check more strict to enable the tests for +# newer versions of MSVC. +if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") ei_add_test(cxx11_tensor_dimension) ei_add_test(cxx11_tensor_map) ei_add_test(cxx11_tensor_assign) @@ -130,7 +136,8 @@ ei_add_test(cxx11_tensor_io) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") # This test requires __uint128_t which is only available on 64bit systems ei_add_test(cxx11_tensor_uint128) -endif() +endif() +endif() if(EIGEN_TEST_CXX11) # It should be safe to always run these tests as there is some fallback code for @@ -139,6 +146,8 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}") + ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") + ei_add_test(cxx11_meta) ei_add_test(cxx11_tensor_simple) # ei_add_test(cxx11_tensor_symmetry) @@ -174,6 +183,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_custom_index) ei_add_test(cxx11_tensor_fft) ei_add_test(cxx11_tensor_ifft) + ei_add_test(cxx11_tensor_scan) endif() @@ -183,37 +193,58 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags since they trigger thousands of compilation warnings # in the CUDA runtime + # Also remove -ansi that is incompatible with std=c++11. 
string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) + set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) endif() if(EIGEN_TEST_CUDA_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}") endif() - set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"") + set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") + if (${CUDA_VERSION} STREQUAL "7.0") + set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") + endif() + + if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) + set(EIGEN_CUDA_CXX11_FLAG "-std=c++11") + else() + # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11) + set(EIGEN_CUDA_CXX11_FLAG "") + endif() + + set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - ei_add_test(cxx11_tensor_device) - ei_add_test(cxx11_tensor_cuda) - ei_add_test(cxx11_tensor_contract_cuda) + ei_add_test(cxx11_tensor_complex_cuda) + ei_add_test(cxx11_tensor_complex_cwise_ops_cuda) ei_add_test(cxx11_tensor_reduction_cuda) ei_add_test(cxx11_tensor_argmax_cuda) ei_add_test(cxx11_tensor_cast_float16_cuda) + ei_add_test(cxx11_tensor_scan_cuda) + + # Contractions require arch 3.0 or higher + if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29) + ei_add_test(cxx11_tensor_device) + ei_add_test(cxx11_tensor_cuda) + ei_add_test(cxx11_tensor_contract_cuda) + ei_add_test(cxx11_tensor_of_float16_cuda) + endif() # The random number generation code requires arch 3.5 or greater. if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) ei_add_test(cxx11_tensor_random_cuda) endif() - ei_add_test(cxx11_tensor_of_float16_cuda) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp new file mode 100644 index 000000000..a8cb52864 --- /dev/null +++ b/unsupported/test/EulerAngles.cpp @@ -0,0 +1,208 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include <unsupported/Eigen/EulerAngles> + +using namespace Eigen; + +template<typename EulerSystem, typename Scalar> +void verify_euler_ranged(const Matrix<Scalar,3,1>& ea, + bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma) +{ + typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType; + typedef Matrix<Scalar,3,3> Matrix3; + typedef Matrix<Scalar,3,1> Vector3; + typedef Quaternion<Scalar> QuaternionType; + typedef AngleAxis<Scalar> AngleAxisType; + using std::abs; + + Scalar alphaRangeStart, alphaRangeEnd; + Scalar betaRangeStart, betaRangeEnd; + Scalar gammaRangeStart, gammaRangeEnd; + + if (positiveRangeAlpha) + { + alphaRangeStart = Scalar(0); + alphaRangeEnd = Scalar(2 * EIGEN_PI); + } + else + { + alphaRangeStart = -Scalar(EIGEN_PI); + alphaRangeEnd = Scalar(EIGEN_PI); + } + + if (positiveRangeBeta) + { + betaRangeStart = Scalar(0); + betaRangeEnd = Scalar(2 * EIGEN_PI); + } + else + { + betaRangeStart = -Scalar(EIGEN_PI); + betaRangeEnd = Scalar(EIGEN_PI); + } + + if (positiveRangeGamma) + { + gammaRangeStart = Scalar(0); + gammaRangeEnd = Scalar(2 * EIGEN_PI); + } + else + { + gammaRangeStart = -Scalar(EIGEN_PI); + gammaRangeEnd = Scalar(EIGEN_PI); + } + + const int i = EulerSystem::AlphaAxisAbs - 1; + const int j = EulerSystem::BetaAxisAbs - 1; + const int k = EulerSystem::GammaAxisAbs - 1; + + const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1; + const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1; + const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1; + + const Vector3 I = EulerAnglesType::AlphaAxisVector(); + const Vector3 J = EulerAnglesType::BetaAxisVector(); + const Vector3 K = EulerAnglesType::GammaAxisVector(); + + EulerAnglesType e(ea[0], ea[1], ea[2]); + + Matrix3 m(e); + Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles(); + + // Check that eabis in range + VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd); + VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd); + VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd); + + Vector3 eabis2 = m.eulerAngles(i, j, k); + + // Invert the relevant axes + eabis2[0] *= iFactor; + eabis2[1] *= jFactor; + eabis2[2] *= kFactor; + + // Saturate the angles to the correct range + if (positiveRangeAlpha && (eabis2[0] < 0)) + eabis2[0] += Scalar(2 * EIGEN_PI); + if (positiveRangeBeta && (eabis2[1] < 0)) + eabis2[1] += Scalar(2 * EIGEN_PI); + if (positiveRangeGamma && (eabis2[2] < 0)) + eabis2[2] += Scalar(2 * EIGEN_PI); + + VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is + + Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K)); + VERIFY_IS_APPROX(m, mbis); + + // Tests that are only relevant for no possitive range + if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma)) + { + /* If I==K, and ea[1]==0, then there no unique solution. */ + /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. 
*/ + if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) + VERIFY((ea-eabis).norm() <= test_precision<Scalar>()); + + // approx_or_less_than does not work for 0 + VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1))); + } + + // Quaternions + QuaternionType q(e); + eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles(); + VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same +} + +template<typename EulerSystem, typename Scalar> +void verify_euler(const Matrix<Scalar,3,1>& ea) +{ + verify_euler_ranged<EulerSystem>(ea, false, false, false); + verify_euler_ranged<EulerSystem>(ea, false, false, true); + verify_euler_ranged<EulerSystem>(ea, false, true, false); + verify_euler_ranged<EulerSystem>(ea, false, true, true); + verify_euler_ranged<EulerSystem>(ea, true, false, false); + verify_euler_ranged<EulerSystem>(ea, true, false, true); + verify_euler_ranged<EulerSystem>(ea, true, true, false); + verify_euler_ranged<EulerSystem>(ea, true, true, true); +} + +template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea) +{ + verify_euler<EulerSystemXYZ>(ea); + verify_euler<EulerSystemXYX>(ea); + verify_euler<EulerSystemXZY>(ea); + verify_euler<EulerSystemXZX>(ea); + + verify_euler<EulerSystemYZX>(ea); + verify_euler<EulerSystemYZY>(ea); + verify_euler<EulerSystemYXZ>(ea); + verify_euler<EulerSystemYXY>(ea); + + verify_euler<EulerSystemZXY>(ea); + verify_euler<EulerSystemZXZ>(ea); + verify_euler<EulerSystemZYX>(ea); + verify_euler<EulerSystemZYZ>(ea); +} + +template<typename Scalar> void eulerangles() +{ + typedef Matrix<Scalar,3,3> Matrix3; + typedef Matrix<Scalar,3,1> Vector3; + typedef Array<Scalar,3,1> Array3; + typedef Quaternion<Scalar> Quaternionx; + typedef AngleAxis<Scalar> AngleAxisType; + + Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI)); + Quaternionx q1; + q1 = AngleAxisType(a, Vector3::Random().normalized()); + Matrix3 m; + m = q1; + + Vector3 ea = m.eulerAngles(0,1,2); + check_all_var(ea); + ea = m.eulerAngles(0,1,0); + check_all_var(ea); + + // Check with purely random Quaternion: + q1.coeffs() = Quaternionx::Coefficients::Random().normalized(); + m = q1; + ea = m.eulerAngles(0,1,2); + check_all_var(ea); + ea = m.eulerAngles(0,1,0); + check_all_var(ea); + + // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi]. 
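// (Array3::Random() takes values in [-1,1]^3, so adding (1,0,0) and scaling by
// pi*(0.5,1,1) maps the first component to [0,pi] and the other two to [-pi,pi].)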
+ ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1); + check_all_var(ea); + + ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI)); + check_all_var(ea); + + ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI)); + check_all_var(ea); + + ea[1] = 0; + check_all_var(ea); + + ea.head(2).setZero(); + check_all_var(ea); + + ea.setZero(); + check_all_var(ea); +} + +void test_EulerAngles() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( eulerangles<float>() ); + CALL_SUBTEST_2( eulerangles<double>() ); + } +} diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp index d3718e2d2..8b7528fb7 100644 --- a/unsupported/test/FFTW.cpp +++ b/unsupported/test/FFTW.cpp @@ -18,11 +18,11 @@ using namespace Eigen; template < typename T> -complex<long double> promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); } +complex<long double> promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); } -complex<long double> promote(float x) { return complex<long double>( x); } -complex<long double> promote(double x) { return complex<long double>( x); } -complex<long double> promote(long double x) { return complex<long double>( x); } +complex<long double> promote(float x) { return complex<long double>((long double)x); } +complex<long double> promote(double x) { return complex<long double>((long double)x); } +complex<long double> promote(long double x) { return complex<long double>((long double)x); } template <typename VT1,typename VT2> @@ -33,7 +33,7 @@ complex<long double> promote(long double x) { return complex<long double>( x); long double pi = acos((long double)-1 ); for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) { complex<long double> acc = 0; - long double phinc = -2.*k0* pi / timebuf.size(); + long double phinc = (long double)(-2.)*k0* pi / timebuf.size(); for (size_t k1=0;k1<(size_t)timebuf.size();++k1) { acc += promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) ); } @@ -54,8 +54,8 @@ complex<long double> promote(long double x) { return complex<long double>( x); long double difpower=0; size_t n = (min)( buf1.size(),buf2.size() ); for (size_t k=0;k<n;++k) { - totalpower += (numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2.; - difpower += numext::abs2(buf1[k] - buf2[k]); + totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2); + difpower += (long double)(numext::abs2(buf1[k] - buf2[k])); } return sqrt(difpower/totalpower); } @@ -93,19 +93,19 @@ void test_scalar_generic(int nfft) fft.SetFlag(fft.HalfSpectrum ); fft.fwd( freqBuf,tbuf); VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) ); - VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>() );// gross check fft.ClearFlag(fft.HalfSpectrum ); fft.fwd( freqBuf,tbuf); VERIFY( (size_t)freqBuf.size() == (size_t)nfft); - VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>() );// gross check if (nfft&1) return; // odd FFTs get the wrong size inverse FFT ScalarVector tbuf2; fft.inv( tbuf2 , freqBuf); - VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>() );// gross check // verify that the Unscaled flag takes effect @@ -121,12 +121,12 @@ void test_scalar_generic(int nfft) //for (size_t i=0;i<(size_t) tbuf.size();++i) // cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " - in=" << 
tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) << endl; - VERIFY( dif_rmse(tbuf,tbuf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>() );// gross check // verify that ClearFlag works fft.ClearFlag(fft.Unscaled); fft.inv( tbuf2 , freqBuf); - VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>() );// gross check } template <typename T> @@ -152,10 +152,10 @@ void test_complex_generic(int nfft) inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) ); fft.fwd( outbuf , inbuf); - VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>() );// gross check fft.inv( buf3 , outbuf); - VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>() );// gross check // verify that the Unscaled flag takes effect ComplexVector buf4; @@ -163,12 +163,12 @@ void test_complex_generic(int nfft) fft.inv( buf4 , outbuf); for (int k=0;k<nfft;++k) buf4[k] *= T(1./nfft); - VERIFY( dif_rmse(inbuf,buf4) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>() );// gross check // verify that ClearFlag works fft.ClearFlag(fft.Unscaled); fft.inv( buf3 , outbuf); - VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>() );// gross check } template <typename T> diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp index 374f86df9..85743137e 100644 --- a/unsupported/test/autodiff.cpp +++ b/unsupported/test/autodiff.cpp @@ -16,7 +16,8 @@ EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y) using namespace std; // return x+std::sin(y); EIGEN_ASM_COMMENT("mybegin"); - return static_cast<Scalar>(x*2 - 1 + pow(1+x,2) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(-0.5*x*x+0)); + // pow(float, int) promotes to pow(double, double) + return x*2 - 1 + static_cast<Scalar>(pow(1+x,2)) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(Scalar(-0.5)*x*x+0); //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2; EIGEN_ASM_COMMENT("myend"); } @@ -104,6 +105,89 @@ struct TestFunc1 } }; + +#if EIGEN_HAS_VARIADIC_TEMPLATES +/* Test functor for the C++11 features. */ +template <typename Scalar> +struct integratorFunctor +{ + typedef Matrix<Scalar, 2, 1> InputType; + typedef Matrix<Scalar, 2, 1> ValueType; + + /* + * Implementation starts here. + */ + integratorFunctor(const Scalar gain) : _gain(gain) {} + integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {} + const Scalar _gain; + + template <typename T1, typename T2> + void operator() (const T1 &input, T2 *output, const Scalar dt) const + { + T2 &o = *output; + + /* Integrator to test the AD. */ + o[0] = input[0] + input[1] * dt * _gain; + o[1] = input[1] * _gain; + } + + /* Only needed for the test */ + template <typename T1, typename T2, typename T3> + void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const + { + T2 &o = *output; + + /* Integrator to test the AD. 
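The analytic Jacobian filled in below, [[1, dt*_gain], [0, _gain]], is the reference that AutoDiffJacobian is expected to reproduce in forward_jacobian_cpp11().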
*/ + o[0] = input[0] + input[1] * dt * _gain; + o[1] = input[1] * _gain; + + if (jacobian) + { + T3 &j = *jacobian; + + j(0, 0) = 1; + j(0, 1) = dt * _gain; + j(1, 0) = 0; + j(1, 1) = _gain; + } + } + +}; + +template<typename Func> void forward_jacobian_cpp11(const Func& f) +{ + typedef typename Func::ValueType::Scalar Scalar; + typedef typename Func::ValueType ValueType; + typedef typename Func::InputType InputType; + typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType; + + InputType x = InputType::Random(InputType::RowsAtCompileTime); + ValueType y, yref; + JacobianType j, jref; + + const Scalar dt = internal::random<double>(); + + jref.setZero(); + yref.setZero(); + f(x, &yref, &jref, dt); + + //std::cerr << "y, yref, jref: " << "\n"; + //std::cerr << y.transpose() << "\n\n"; + //std::cerr << yref << "\n\n"; + //std::cerr << jref << "\n\n"; + + AutoDiffJacobian<Func> autoj(f); + autoj(x, &y, &j, dt); + + //std::cerr << "y j (via autodiff): " << "\n"; + //std::cerr << y.transpose() << "\n\n"; + //std::cerr << j << "\n\n"; + + VERIFY_IS_APPROX(y, yref); + VERIFY_IS_APPROX(j, jref); +} +#endif + template<typename Func> void forward_jacobian(const Func& f) { typename Func::InputType x = Func::InputType::Random(f.inputs()); @@ -127,7 +211,6 @@ template<typename Func> void forward_jacobian(const Func& f) VERIFY_IS_APPROX(j, jref); } - // TODO also check actual derivatives! template <int> void test_autodiff_scalar() @@ -140,6 +223,7 @@ void test_autodiff_scalar() VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y())); } + // TODO also check actual derivatives! template <int> void test_autodiff_vector() @@ -150,7 +234,7 @@ void test_autodiff_vector() VectorAD ap = p.cast<AD>(); ap.x().derivatives() = Vector2f::UnitX(); ap.y().derivatives() = Vector2f::UnitY(); - + AD res = foo<VectorAD>(ap); VERIFY_IS_APPROX(res.value(), foo(p)); } @@ -163,6 +247,9 @@ void test_autodiff_jacobian() CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) )); CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) )); CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) )); +#if EIGEN_HAS_VARIADIC_TEMPLATES + CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) )); +#endif } @@ -204,9 +291,64 @@ void test_autodiff_hessian() VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4)); VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3)); VERIFY_IS_APPROX(y.derivatives()(1).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4)); + + ADD z = x(0)*x(1); + VERIFY_IS_APPROX(z.derivatives()(0).derivatives(), Vector2d(0,1)); + VERIFY_IS_APPROX(z.derivatives()(1).derivatives(), Vector2d(1,0)); +} + +double bug_1222() { + typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD; + const double _cv1_3 = 1.0; + const AD chi_3 = 1.0; + // this line did not work, because operator+ returns ADS<DerType&>, which then cannot be converted to ADS<DerType> + const AD denom = chi_3 + _cv1_3; + return denom.value(); +} + +double bug_1223() { + using std::min; + typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD; + + const double _cv1_3 = 1.0; + const AD chi_3 = 1.0; + const AD denom = 1.0; + + // failed because implementation of min attempts to construct ADS<DerType&> via constructor AutoDiffScalar(const Real& value) + // without initializing m_derivatives (which is a reference in this case) + #define EIGEN_TEST_SPACE + const AD t = min EIGEN_TEST_SPACE (denom / chi_3, 1.0); + + const AD t2 = min EIGEN_TEST_SPACE (denom / (chi_3 * _cv1_3), 1.0); + + return 
t.value() + t2.value(); +} + +// regression test for some compilation issues with specializations of ScalarBinaryOpTraits +void bug_1260() { + Matrix4d A; + Vector4d v; + A*v; } +// check a compilation issue with numext::max +double bug_1261() { + typedef AutoDiffScalar<Matrix2d> AD; + typedef Matrix<AD,2,1> VectorAD; + + VectorAD v; + const AD maxVal = v.maxCoeff(); + const AD minVal = v.minCoeff(); + return maxVal.value() + minVal.value(); +} +double bug_1264() { + typedef AutoDiffScalar<Vector2d> AD; + const AD s; + const Matrix<AD, 3, 1> v1; + const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1; + return v2(0).value(); +} void test_autodiff() { @@ -216,5 +358,10 @@ void test_autodiff() CALL_SUBTEST_3( test_autodiff_jacobian<1>() ); CALL_SUBTEST_4( test_autodiff_hessian<1>() ); } + + bug_1222(); + bug_1223(); + bug_1260(); + bug_1261(); } diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index c631c734a..4df2f5c57 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -36,13 +36,48 @@ template<typename Scalar> void check_atan2() VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); } +template<typename Scalar> void check_hyperbolic_functions() +{ + using std::sinh; + using std::cosh; + using std::tanh; + typedef Matrix<Scalar, 1, 1> Deriv1; + typedef AutoDiffScalar<Deriv1> AD; + Deriv1 p = Deriv1::Random(); + AD val(p.x(),Deriv1::UnitX()); + + Scalar cosh_px = std::cosh(p.x()); + AD res1 = tanh(val); + VERIFY_IS_APPROX(res1.value(), std::tanh(p.x())); + VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(1.0) / (cosh_px * cosh_px)); + AD res2 = sinh(val); + VERIFY_IS_APPROX(res2.value(), std::sinh(p.x())); + VERIFY_IS_APPROX(res2.derivatives().x(), cosh_px); + AD res3 = cosh(val); + VERIFY_IS_APPROX(res3.value(), cosh_px); + VERIFY_IS_APPROX(res3.derivatives().x(), std::sinh(p.x())); + + // Check constant values. 
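// (The reference derivatives checked below are tanh'(1/3) = 1/cosh^2(1/3) ~ 0.896630,
// sinh'(1/3) = cosh(1/3) ~ 1.056072 and cosh'(1/3) = sinh(1/3) ~ 0.339541.)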
+ const Scalar sample_point = Scalar(1) / Scalar(3); + val = AD(sample_point,Deriv1::UnitX()); + res1 = tanh(val); + VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(0.896629559604914)); + + res2 = sinh(val); + VERIFY_IS_APPROX(res2.derivatives().x(), Scalar(1.056071867829939)); + + res3 = cosh(val); + VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150)); +} void test_autodiff_scalar() { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( check_atan2<float>() ); CALL_SUBTEST_2( check_atan2<double>() ); + CALL_SUBTEST_3( check_hyperbolic_functions<float>() ); + CALL_SUBTEST_4( check_hyperbolic_functions<double>() ); } } diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index f16cc6f07..3b598bf42 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -25,7 +25,8 @@ int rand_reentrant(unsigned int* s) { static void test_basic_eventcount() { - std::vector<EventCount::Waiter> waiters(1); + MaxSizeVector<EventCount::Waiter> waiters(1); + waiters.resize(1); EventCount ec(waiters); EventCount::Waiter& w = waiters[0]; ec.Notify(false); @@ -81,7 +82,8 @@ static void test_stress_eventcount() static const int kEvents = 1 << 16; static const int kQueues = 10; - std::vector<EventCount::Waiter> waiters(kThreads); + MaxSizeVector<EventCount::Waiter> waiters(kThreads); + waiters.resize(kThreads); EventCount ec(waiters); TestQueue queues[kQueues]; diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp deleted file mode 100644 index 9a813653c..000000000 --- a/unsupported/test/cxx11_float16.cpp +++ /dev/null @@ -1,192 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_float16 - -#include "main.h" -#include <Eigen/src/Core/arch/CUDA/Half.h> - -using Eigen::half; - -void test_conversion() -{ - // Conversion from float. - VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5f).x, 0x3800); - VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555); - VERIFY_IS_EQUAL(half(0.0f).x, 0x0000); - VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000); - VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff); - VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity. - - // Denormals. - VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001); - VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001); - VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); - - // Verify round-to-nearest-even behavior. - float val1 = float(half(__half(0x3c00))); - float val2 = float(half(__half(0x3c01))); - float val3 = float(half(__half(0x3c02))); - VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02); - - // Conversion from int. - VERIFY_IS_EQUAL(half(-1).x, 0xbc00); - VERIFY_IS_EQUAL(half(0).x, 0x0000); - VERIFY_IS_EQUAL(half(1).x, 0x3c00); - VERIFY_IS_EQUAL(half(2).x, 0x4000); - VERIFY_IS_EQUAL(half(3).x, 0x4200); - - // Conversion from bool. - VERIFY_IS_EQUAL(half(false).x, 0x0000); - VERIFY_IS_EQUAL(half(true).x, 0x3c00); - - // Conversion to float. - VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f); - VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f); - - // Denormals. 
- VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f); - - // NaNs and infinities. - VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. - VERIFY(!(numext::isnan)(float(half(0.0f)))); - VERIFY((numext::isinf)(float(half(__half(0xfc00))))); - VERIFY((numext::isnan)(float(half(__half(0xfc01))))); - VERIFY((numext::isinf)(float(half(__half(0x7c00))))); - VERIFY((numext::isnan)(float(half(__half(0x7c01))))); - -#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); - VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); - VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); -#endif - - // Exactly same checks as above, just directly on the half representation. - VERIFY(!(numext::isinf)(half(__half(0x7bff)))); - VERIFY(!(numext::isnan)(half(__half(0x0000)))); - VERIFY((numext::isinf)(half(__half(0xfc00)))); - VERIFY((numext::isnan)(half(__half(0xfc01)))); - VERIFY((numext::isinf)(half(__half(0x7c00)))); - VERIFY((numext::isnan)(half(__half(0x7c01)))); - -#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY((numext::isnan)(half(0.0 / 0.0))); - VERIFY((numext::isinf)(half(1.0 / 0.0))); - VERIFY((numext::isinf)(half(-1.0 / 0.0))); -#endif -} - -void test_arithmetic() -{ - VERIFY_IS_EQUAL(float(half(2) + half(2)), 4); - VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0); - VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f); - VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f); - VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f); - VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f); - VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f); -} - -void test_comparison() -{ - VERIFY(half(1.0f) > half(0.5f)); - VERIFY(half(0.5f) < half(1.0f)); - VERIFY(!(half(1.0f) < half(0.5f))); - VERIFY(!(half(0.5f) > half(1.0f))); - - VERIFY(!(half(4.0f) > half(4.0f))); - VERIFY(!(half(4.0f) < half(4.0f))); - - VERIFY(!(half(0.0f) < half(-0.0f))); - VERIFY(!(half(-0.0f) < half(0.0f))); - VERIFY(!(half(0.0f) > half(-0.0f))); - VERIFY(!(half(-0.0f) > half(0.0f))); - - VERIFY(half(0.2f) > half(-1.0f)); - VERIFY(half(-1.0f) < half(0.2f)); - VERIFY(half(-16.0f) < half(-15.0f)); - - VERIFY(half(1.0f) == half(1.0f)); - VERIFY(half(1.0f) != half(2.0f)); - - // Comparisons with NaNs and infinities. 
-#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); - VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); - - VERIFY(!(half(1.0) == half(0.0 / 0.0))); - VERIFY(!(half(1.0) < half(0.0 / 0.0))); - VERIFY(!(half(1.0) > half(0.0 / 0.0))); - VERIFY(half(1.0) != half(0.0 / 0.0)); - - VERIFY(half(1.0) < half(1.0 / 0.0)); - VERIFY(half(1.0) > half(-1.0 / 0.0)); -#endif -} - -void test_basic_functions() -{ - VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); - VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); - - VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f); - VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f); - - VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f); - VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f); - - VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f); - - VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f); - - VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); - VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); - - VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); -} - -void test_trigonometric_functions() -{ - VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); - VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); - //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); - //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); - - VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); - // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); - VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f))); - - VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2))); - //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f))); -} - -void test_cxx11_float16() -{ - CALL_SUBTEST(test_conversion()); - CALL_SUBTEST(test_arithmetic()); - CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_basic_functions()); - CALL_SUBTEST(test_trigonometric_functions()); -} diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp new file mode 100644 index 000000000..5f9bb938b --- /dev/null +++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -0,0 +1,107 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com> +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS +#include "main.h" +#include "Eigen/CXX11/ThreadPool" + +static void test_create_destroy_empty_pool() +{ + // Just create and destroy the pool. 
This will wind up and tear down worker + // threads. Ensure there are no issues in that logic. + for (int i = 0; i < 16; ++i) { + NonBlockingThreadPool tp(i); + } +} + + +static void test_parallelism() +{ + // Test we never-ever fail to match available tasks with idle threads. + const int kThreads = 16; // code below expects that this is a multiple of 4 + NonBlockingThreadPool tp(kThreads); + VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); + VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); + for (int iter = 0; iter < 100; ++iter) { + std::atomic<int> running(0); + std::atomic<int> done(0); + std::atomic<int> phase(0); + // Schedule kThreads tasks and ensure that they all are running. + for (int i = 0; i < kThreads; ++i) { + tp.Schedule([&]() { + const int thread_id = tp.CurrentThreadId(); + VERIFY_GE(thread_id, 0); + VERIFY_LE(thread_id, kThreads - 1); + running++; + while (phase < 1) { + } + done++; + }); + } + while (running != kThreads) { + } + running = 0; + phase = 1; + // Now, while the previous tasks exit, schedule another kThreads tasks and + // ensure that they are running. + for (int i = 0; i < kThreads; ++i) { + tp.Schedule([&, i]() { + running++; + while (phase < 2) { + } + // When all tasks are running, half of tasks exit, quarter of tasks + // continue running and quarter of tasks schedule another 2 tasks each. + // Concurrently main thread schedules another quarter of tasks. + // This gives us another kThreads tasks and we ensure that they all + // are running. + if (i < kThreads / 2) { + } else if (i < 3 * kThreads / 4) { + running++; + while (phase < 3) { + } + done++; + } else { + for (int j = 0; j < 2; ++j) { + tp.Schedule([&]() { + running++; + while (phase < 3) { + } + done++; + }); + } + } + done++; + }); + } + while (running != kThreads) { + } + running = 0; + phase = 2; + for (int i = 0; i < kThreads / 4; ++i) { + tp.Schedule([&]() { + running++; + while (phase < 3) { + } + done++; + }); + } + while (running != kThreads) { + } + phase = 3; + while (done != 3 * kThreads) { + } + } +} + +void test_cxx11_non_blocking_thread_pool() +{ + CALL_SUBTEST(test_create_destroy_empty_pool()); + CALL_SUBTEST(test_parallelism()); +} diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index d1770ee1b..91f690114 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -33,73 +33,81 @@ void test_basic_runqueue() VERIFY_IS_EQUAL(0u, q.Size()); VERIFY_IS_EQUAL(0, q.PopFront()); std::vector<int> stolen; - VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen)); VERIFY_IS_EQUAL(0u, stolen.size()); // Push one front, pop one front. VERIFY_IS_EQUAL(0, q.PushFront(1)); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); VERIFY_IS_EQUAL(1, q.PopFront()); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); // Push front to overflow. 
VERIFY_IS_EQUAL(0, q.PushFront(2)); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); VERIFY_IS_EQUAL(0, q.PushFront(3)); - VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(2u, q.Size()); VERIFY_IS_EQUAL(0, q.PushFront(4)); - VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(3u, q.Size()); VERIFY_IS_EQUAL(0, q.PushFront(5)); - VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(4u, q.Size()); VERIFY_IS_EQUAL(6, q.PushFront(6)); - VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(4u, q.Size()); VERIFY_IS_EQUAL(5, q.PopFront()); - VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(3u, q.Size()); VERIFY_IS_EQUAL(4, q.PopFront()); - VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(2u, q.Size()); VERIFY_IS_EQUAL(3, q.PopFront()); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); VERIFY_IS_EQUAL(2, q.PopFront()); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); VERIFY_IS_EQUAL(0, q.PopFront()); // Push one back, pop one back. VERIFY_IS_EQUAL(0, q.PushBack(7)); - VERIFY_IS_EQUAL(1, q.Size()); - VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(1u, q.Size()); + VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1u, stolen.size()); VERIFY_IS_EQUAL(7, stolen[0]); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); stolen.clear(); // Push back to overflow. VERIFY_IS_EQUAL(0, q.PushBack(8)); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); VERIFY_IS_EQUAL(0, q.PushBack(9)); - VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(2u, q.Size()); VERIFY_IS_EQUAL(0, q.PushBack(10)); - VERIFY_IS_EQUAL(3, q.Size()); + VERIFY_IS_EQUAL(3u, q.Size()); VERIFY_IS_EQUAL(0, q.PushBack(11)); - VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(4u, q.Size()); VERIFY_IS_EQUAL(12, q.PushBack(12)); - VERIFY_IS_EQUAL(4, q.Size()); + VERIFY_IS_EQUAL(4u, q.Size()); // Pop back in halves. - VERIFY_IS_EQUAL(2, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(2, stolen.size()); + VERIFY_IS_EQUAL(2u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(2u, stolen.size()); VERIFY_IS_EQUAL(10, stolen[0]); VERIFY_IS_EQUAL(11, stolen[1]); - VERIFY_IS_EQUAL(2, q.Size()); + VERIFY_IS_EQUAL(2u, q.Size()); stolen.clear(); - VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1u, stolen.size()); VERIFY_IS_EQUAL(9, stolen[0]); - VERIFY_IS_EQUAL(1, q.Size()); + VERIFY_IS_EQUAL(1u, q.Size()); stolen.clear(); - VERIFY_IS_EQUAL(1, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(1, stolen.size()); + VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(1u, stolen.size()); VERIFY_IS_EQUAL(8, stolen[0]); stolen.clear(); - VERIFY_IS_EQUAL(0, q.PopBackHalf(&stolen)); - VERIFY_IS_EQUAL(0, stolen.size()); + VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen)); + VERIFY_IS_EQUAL(0u, stolen.size()); // Empty again. VERIFY(q.Empty()); - VERIFY_IS_EQUAL(0, q.Size()); + VERIFY_IS_EQUAL(0u, q.Size()); + VERIFY_IS_EQUAL(0, q.PushFront(1)); + VERIFY_IS_EQUAL(0, q.PushFront(2)); + VERIFY_IS_EQUAL(0, q.PushFront(3)); + VERIFY_IS_EQUAL(1, q.PopBack()); + VERIFY_IS_EQUAL(2, q.PopBack()); + VERIFY_IS_EQUAL(3, q.PopBack()); + VERIFY(q.Empty()); + VERIFY_IS_EQUAL(0u, q.Size()); } // Empty tests that the queue is not claimed to be empty when is is in fact not. 
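The 0u/1u rewrites in this hunk are type fixes rather than behavioural ones: Size() and PopBackHalf() return unsigned sizes, so spelling the expected values as unsigned literals keeps VERIFY_IS_EQUAL from mixing signed and unsigned operands. The queue protocol being verified, stripped down to a host-only sketch and assuming the RunQueue from the Eigen/CXX11/ThreadPool module with capacity 4, which matches the overflow behaviour visible above:

#include <Eigen/CXX11/ThreadPool>
#include <cassert>

int main() {
  Eigen::RunQueue<int, 4> q;           // bounded, work-stealing deque
  assert(q.Empty());

  // PushFront returns 0 on success and hands the element back on overflow.
  assert(q.PushFront(1) == 0);
  assert(q.PushFront(2) == 0);
  assert(q.PushFront(3) == 0);
  assert(q.PushFront(4) == 0);
  assert(q.PushFront(5) == 5);         // full: 5 is rejected and returned
  assert(q.Size() == 4u);              // Size() is unsigned, hence the u suffixes

  assert(q.PopBack() == 1);            // stealers take the oldest element
  assert(q.PopFront() == 4);           // the owner takes the newest one
  return 0;
}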
@@ -130,7 +138,7 @@ void test_empty_runqueue() stolen.clear(); break; } - VERIFY_IS_EQUAL(0, stolen.size()); + VERIFY_IS_EQUAL(0u, stolen.size()); } } } diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 41ccbe974..6fe8982f2 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -12,6 +12,9 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index e5cf61fe1..8fe85d83c 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -286,7 +286,7 @@ static void test_compound_assign() } static void test_std_initializers_tensor() { -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES Tensor<int, 1> a(3); a.setValues({0, 1, 2}); VERIFY_IS_EQUAL(a(0), 0); diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 2ddf47234..5c0ea5889 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -115,7 +115,7 @@ static void test_static_broadcasting() Tensor<float, 3, DataLayout> tensor(8,3,5); tensor.setRandom(); -#ifdef EIGEN_HAS_CONSTEXPR +#if EIGEN_HAS_CONSTEXPR Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; #else Eigen::array<int, 3> broadcasts; diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index f22b99de8..88c233994 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -13,7 +13,9 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU - +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu new file mode 100644 index 000000000..f895efd01 --- /dev/null +++ b/unsupported/test/cxx11_tensor_complex_cuda.cu @@ -0,0 +1,115 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
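A pattern repeated across the CUDA test files in this series is the guarded include that now precedes main.h: cuda_fp16.h only ships with CUDA 7.5 and later, and __CUDACC_VER__ encodes the toolkit version as major*10000 + minor*100, so 70500 is the first release that provides the header:

// Guard used by the .cu tests in this patch: pull in the half-precision
// intrinsics only when the CUDA toolkit is new enough to have them.
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif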
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_FUNC cxx11_tensor_complex +#define EIGEN_USE_GPU + +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +void test_cuda_nullary() { + Tensor<std::complex<float>, 1, 0, int> in1(2); + Tensor<std::complex<float>, 1, 0, int> in2(2); + in1.setRandom(); + in2.setRandom(); + + std::size_t float_bytes = in1.size() * sizeof(float); + std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>); + + std::complex<float>* d_in1; + std::complex<float>* d_in2; + float* d_out2; + cudaMalloc((void**)(&d_in1), complex_bytes); + cudaMalloc((void**)(&d_in2), complex_bytes); + cudaMalloc((void**)(&d_out2), float_bytes); + cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1( + d_in1, 2); + Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2( + d_in2, 2); + Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2( + d_out2, 2); + + gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f)); + gpu_out2.device(gpu_device) = gpu_in2.abs(); + + Tensor<std::complex<float>, 1, 0, int> new1(2); + Tensor<float, 1, 0, int> new2(2); + + assert(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f)); + VERIFY_IS_APPROX(new2(i), std::abs(in2(i))); + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out2); +} + + +static void test_cuda_sum_reductions() { + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + const int num_rows = internal::random<int>(1024, 5*1024); + const int num_cols = internal::random<int>(1024, 5*1024); + + Tensor<std::complex<float>, 2> in(num_rows, num_cols); + in.setRandom(); + + Tensor<std::complex<float>, 0> full_redux; + full_redux = in.sum(); + + std::size_t in_bytes = in.size() * sizeof(std::complex<float>); + std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>); + std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes)); + std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes)); + gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); + + TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr); + + out_gpu.device(gpu_device) = in_gpu.sum(); + + Tensor<std::complex<float>, 0> full_redux_gpu; + gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); + gpu_device.synchronize(); + + // Check that the CPU and GPU reductions return the same result. 
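The new sum-reduction test above leans on rank-0 tensors: a full .sum() collapses every dimension and the resulting scalar is read back with operator(). The same idiom works without any GPU involved; a minimal host-only sketch, with sizes chosen arbitrarily for illustration:

#include <unsupported/Eigen/CXX11/Tensor>
#include <complex>
#include <iostream>

int main() {
  Eigen::Tensor<std::complex<float>, 2> in(3, 4);
  in.setRandom();

  // Full reduction: the result is a rank-0 tensor holding a single value.
  Eigen::Tensor<std::complex<float>, 0> total = in.sum();
  std::cout << "sum = " << total() << "\n";
  return 0;
}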
+ VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); +} + + +void test_cxx11_tensor_complex() +{ + CALL_SUBTEST(test_cuda_nullary()); + CALL_SUBTEST(test_cuda_sum_reductions()); +} diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu new file mode 100644 index 000000000..2baf5eaad --- /dev/null +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu @@ -0,0 +1,97 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops +#define EIGEN_USE_GPU + +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<typename T> +void test_cuda_complex_cwise_ops() { + const int kNumItems = 2; + std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>); + + std::complex<T>* d_in1; + std::complex<T>* d_in2; + std::complex<T>* d_out; + cudaMalloc((void**)(&d_in1), complex_bytes); + cudaMalloc((void**)(&d_in2), complex_bytes); + cudaMalloc((void**)(&d_out), complex_bytes); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1( + d_in1, kNumItems); + Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2( + d_in2, kNumItems); + Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out( + d_out, kNumItems); + + const std::complex<T> a(3.14f, 2.7f); + const std::complex<T> b(-10.6f, 1.4f); + + gpu_in1.device(gpu_device) = gpu_in1.constant(a); + gpu_in2.device(gpu_device) = gpu_in2.constant(b); + + enum CwiseOp { + Add = 0, + Sub, + Mul, + Div + }; + + Tensor<std::complex<T>, 1, 0, int> actual(kNumItems); + for (int op = Add; op <= Div; op++) { + std::complex<T> expected; + switch (static_cast<CwiseOp>(op)) { + case Add: + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; + expected = a + b; + break; + case Sub: + gpu_out.device(gpu_device) = gpu_in1 - gpu_in2; + expected = a - b; + break; + case Mul: + gpu_out.device(gpu_device) = gpu_in1 * gpu_in2; + expected = a * b; + break; + case Div: + gpu_out.device(gpu_device) = gpu_in1 / gpu_in2; + expected = a / b; + break; + } + assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < kNumItems; ++i) { + VERIFY_IS_APPROX(actual(i), expected); + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); +} + + +void test_cxx11_tensor_complex_cwise_ops() +{ + CALL_SUBTEST(test_cuda_complex_cwise_ops<float>()); + CALL_SUBTEST(test_cuda_complex_cwise_ops<double>()); +} diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index 6d1ef07f9..767e9c678 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -14,7 +14,9 @@ #define 
EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU - +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -84,6 +86,65 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) cudaFree((void*)d_t_result); } + +template<int DataLayout> +void test_scalar(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor<float, 2, DataLayout> t_left(m_size, k_size); + Tensor<float, 2, DataLayout> t_right(k_size, n_size); + Tensor<float, 0, DataLayout> t_result; + Tensor<float, 0, DataLayout> t_result_gpu; + Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_left(d_t_left, m_size, k_size); + Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > + gpu_t_right(d_t_right, k_size, n_size); + Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > + gpu_t_result(d_t_result); + + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + t_result = t_left.contract(t_right, dims); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + if (fabs(t_result() - t_result_gpu()) > 1e-4f && + !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { + std::cout << "mismatch detected: " << t_result() + << " vs " << t_result_gpu() << std::endl; + assert(false); + } + + cudaFree((void*)d_t_left); + cudaFree((void*)d_t_right); + cudaFree((void*)d_t_result); +} + + template<int DataLayout> void test_cuda_contraction_m() { for (int k = 32; k < 256; k++) { @@ -138,6 +199,9 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128)); + CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 0e16308a2..ace97057f 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -87,19 +87,14 @@ static void test_scalar() vec1.setRandom(); vec2.setRandom(); - Tensor<float, 1, DataLayout> scalar(1); - scalar.setZero(); Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}}; - typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator; - Evaluator eval(vec1.contract(vec2, dims), DefaultDevice()); - eval.evalTo(scalar.data()); - 
EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + Tensor<float, 0, DataLayout> scalar = vec1.contract(vec2, dims); float expected = 0.0f; for (int i = 0; i < 6; ++i) { expected += vec1(i) * vec2(i); } - VERIFY_IS_APPROX(scalar(0), expected); + VERIFY_IS_APPROX(scalar(), expected); } template<int DataLayout> @@ -494,6 +489,27 @@ static void test_tensor_product() } +template<int DataLayout> +static void test_const_inputs() +{ + Tensor<float, 2, DataLayout> in1(2, 3); + Tensor<float, 2, DataLayout> in2(3, 2); + in1.setRandom(); + in2.setRandom(); + + TensorMap<Tensor<const float, 2, DataLayout> > mat1(in1.data(), 2, 3); + TensorMap<Tensor<const float, 2, DataLayout> > mat2(in2.data(), 3, 2); + Tensor<float, 2, DataLayout> mat3(2,2); + + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1)); + VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); +} + void test_cxx11_tensor_contraction() { CALL_SUBTEST(test_evals<ColMajor>()); @@ -524,4 +540,6 @@ void test_cxx11_tensor_contraction() CALL_SUBTEST(test_small_blocking_factors<RowMajor>()); CALL_SUBTEST(test_tensor_product<ColMajor>()); CALL_SUBTEST(test_tensor_product<RowMajor>()); + CALL_SUBTEST(test_const_inputs<ColMajor>()); + CALL_SUBTEST(test_const_inputs<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 4026f48f0..bf216587a 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -10,19 +10,65 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_cuda -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU - +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; +void test_cuda_nullary() { + Tensor<float, 1, 0, int> in1(2); + Tensor<float, 1, 0, int> in2(2); + in1.setRandom(); + in2.setRandom(); + + std::size_t tensor_bytes = in1.size() * sizeof(float); + + float* d_in1; + float* d_in2; + cudaMalloc((void**)(&d_in1), tensor_bytes); + cudaMalloc((void**)(&d_in2), tensor_bytes); + cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1( + d_in1, 2); + Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2( + d_in2, 2); + + gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f); + gpu_in2.device(gpu_device) = gpu_in2.random(); + + Tensor<float, 1, 0, int> new1(2); + Tensor<float, 1, 0, int> new2(2); + + assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX(new1(i), 3.14f); + VERIFY_IS_NOT_EQUAL(new2(i), in2(i)); + } + + 
cudaFree(d_in1); + cudaFree(d_in2); +} + void test_cuda_elementwise_small() { - Tensor<float, 1> in1(Eigen::array<int, 1>(2)); - Tensor<float, 1> in2(Eigen::array<int, 1>(2)); - Tensor<float, 1> out(Eigen::array<int, 1>(2)); + Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2)); + Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2)); + Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2)); in1.setRandom(); in2.setRandom(); @@ -44,11 +90,11 @@ void test_cuda_elementwise_small() { Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( - d_in1, Eigen::array<int, 1>(2)); + d_in1, Eigen::array<Eigen::DenseIndex, 1>(2)); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2( - d_in2, Eigen::array<int, 1>(2)); + d_in2, Eigen::array<Eigen::DenseIndex, 1>(2)); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out( - d_out, Eigen::array<int, 1>(2)); + d_out, Eigen::array<Eigen::DenseIndex, 1>(2)); gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; @@ -58,8 +104,8 @@ void test_cuda_elementwise_small() { for (int i = 0; i < 2; ++i) { VERIFY_IS_APPROX( - out(Eigen::array<int, 1>(i)), - in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i))); + out(Eigen::array<Eigen::DenseIndex, 1>(i)), + in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i))); } cudaFree(d_in1); @@ -69,10 +115,10 @@ void test_cuda_elementwise_small() { void test_cuda_elementwise() { - Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97)); - Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97)); - Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97)); - Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); + Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); + Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); + Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); + Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); in1.setRandom(); in2.setRandom(); in3.setRandom(); @@ -98,10 +144,10 @@ void test_cuda_elementwise() Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97)); - Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); + Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; @@ -111,7 +157,7 @@ void test_cuda_elementwise() for (int i = 0; i < 72; ++i) { for (int j = 0; j < 53; ++j) { for (int k = 0; k < 97; ++k) { - VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k))); + VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k))); } } } @@ -181,7 
+227,7 @@ void test_cuda_reduction() Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); - array<int, 2> reduction_axis; + array<Eigen::DenseIndex, 2> reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; @@ -214,8 +260,8 @@ void test_cuda_contraction() // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31); - Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1)); - Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1)); + Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1)); + Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1)); t_left.setRandom(); t_right.setRandom(); @@ -299,7 +345,7 @@ void test_cuda_convolution_1d() Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137); - Eigen::array<int, 1> dims(1); + Eigen::array<Eigen::DenseIndex, 1> dims(1); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); @@ -352,7 +398,7 @@ void test_cuda_convolution_inner_dim_col_major_1d() Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4); Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7); - Eigen::array<int, 1> dims(0); + Eigen::array<Eigen::DenseIndex, 1> dims(0); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); @@ -405,7 +451,7 @@ void test_cuda_convolution_inner_dim_row_major_1d() Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4); Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71); - Eigen::array<int, 1> dims(3); + Eigen::array<Eigen::DenseIndex, 1> dims(3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); @@ -459,7 +505,7 @@ void test_cuda_convolution_2d() Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137); - Eigen::array<int, 2> dims(1,2); + Eigen::array<Eigen::DenseIndex, 2> dims(1,2); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); @@ -496,9 +542,9 @@ void test_cuda_convolution_2d() template<int DataLayout> void test_cuda_convolution_3d() { - Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17)); + Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17)); Tensor<float, 3, DataLayout> kernel(3,4,2); - Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17)); + Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17)); input = input.constant(10.0f) + input.random(); kernel = kernel.constant(7.0f) + kernel.random(); @@ -523,7 +569,7 @@ void test_cuda_convolution_3d() Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2); 
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17); - Eigen::array<int, 3> dims(1,2,3); + Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); @@ -1019,8 +1065,156 @@ void test_cuda_erfc(const Scalar stddev) cudaFree(d_out); } +template <typename Scalar> +void test_cuda_betainc() +{ + Tensor<Scalar, 1> in_x(125); + Tensor<Scalar, 1> in_a(125); + Tensor<Scalar, 1> in_b(125); + Tensor<Scalar, 1> out(125); + Tensor<Scalar, 1> expected_out(125); + out.setZero(); + + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + + Array<Scalar, 1, Dynamic> x(125); + Array<Scalar, 1, Dynamic> a(125); + Array<Scalar, 1, Dynamic> b(125); + Array<Scalar, 1, Dynamic> v(125); + + a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999; + + b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 
0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999; + + x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, + -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1; + + v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, + 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, + 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, + 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, + nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, + 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, + 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, + 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, + 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, + 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, + nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444, + nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229, + nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, + nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306, + 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302, + 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, + 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196, + 0.5000000000004947, 0.9999999999999999, nan; + + for (int i = 0; i < 125; ++i) { + in_x(i) = x(i); + in_a(i) = a(i); + in_b(i) = b(i); + expected_out(i) = v(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_a; + Scalar* d_in_b; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_a), bytes); + cudaMalloc((void**)(&d_in_b), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125); + + gpu_out.device(gpu_device) = 
betainc(gpu_in_a, gpu_in_b, gpu_in_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 1; i < 125; ++i) { + if ((std::isnan)(expected_out(i))) { + VERIFY((std::isnan)(out(i))); + } else { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } + + cudaFree(d_in_x); + cudaFree(d_in_a); + cudaFree(d_in_b); + cudaFree(d_out); +} + + void test_cxx11_tensor_cuda() { + CALL_SUBTEST_1(test_cuda_nullary()); CALL_SUBTEST_1(test_cuda_elementwise_small()); CALL_SUBTEST_1(test_cuda_elementwise()); CALL_SUBTEST_1(test_cuda_props()); @@ -1086,5 +1280,8 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_5(test_cuda_igamma<double>()); CALL_SUBTEST_5(test_cuda_igammac<double>()); + + CALL_SUBTEST_6(test_cuda_betainc<float>()); + CALL_SUBTEST_6(test_cuda_betainc<double>()); #endif } diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index cbe9e6449..fde20ddf2 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -13,7 +13,9 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU - +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -241,7 +243,7 @@ void test_cpu() { const float result = out(i,j,k); const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); - if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { continue; } VERIFY_IS_APPROX(expected, result); @@ -258,7 +260,7 @@ void test_cpu() { in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); - if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { continue; } VERIFY_IS_APPROX(expected, result); diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp index 421e73693..16f168ed4 100644 --- a/unsupported/test/cxx11_tensor_dimension.cpp +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -21,7 +21,7 @@ static void test_dynamic_size() VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); - VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7); + VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7); VERIFY_IS_EQUAL((int)dimensions[0], 2); VERIFY_IS_EQUAL((int)dimensions[1], 3); VERIFY_IS_EQUAL((int)dimensions[2], 7); @@ -34,12 +34,12 @@ static void test_fixed_size() VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); - VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7); + VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7); } static void test_match() { - Eigen::DSizes<int, 3> dyn(2,3,7); + Eigen::DSizes<unsigned int, 3> dyn((unsigned int)2,(unsigned int)3,(unsigned int)7); Eigen::Sizes<2,3,7> stat; VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true); @@ -51,13 +51,13 @@ static void test_match() static void test_rank_zero() { Eigen::Sizes<> scalar; - VERIFY_IS_EQUAL(scalar.TotalSize(), 1); - VERIFY_IS_EQUAL(scalar.rank(), 0); - 
VERIFY_IS_EQUAL(internal::array_prod(scalar), 1); + VERIFY_IS_EQUAL((int)scalar.TotalSize(), 1); + VERIFY_IS_EQUAL((int)scalar.rank(), 0); + VERIFY_IS_EQUAL((int)internal::array_prod(scalar), 1); Eigen::DSizes<ptrdiff_t, 0> dscalar; - VERIFY_IS_EQUAL(dscalar.TotalSize(), 1); - VERIFY_IS_EQUAL(dscalar.rank(), 0); + VERIFY_IS_EQUAL((int)dscalar.TotalSize(), 1); + VERIFY_IS_EQUAL((int)dscalar.rank(), 0); } void test_cxx11_tensor_dimension() diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 8389e9840..77e24cb67 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -16,8 +16,8 @@ using Eigen::RowMajor; static void test_1d() { - Tensor<float, 1> vec1({6}); - Tensor<float, 1, RowMajor> vec2({6}); + Tensor<float, 1> vec1(6); + Tensor<float, 1, RowMajor> vec2(6); vec1(0) = 4.0; vec2(0) = 0.0; vec1(1) = 8.0; vec2(1) = 1.0; @@ -112,13 +112,13 @@ static void test_3d() Tensor<float, 3> mat1(2,3,7); Tensor<float, 3, RowMajor> mat2(2,3,7); - float val = 1.0; + float val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; mat2(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } @@ -142,7 +142,7 @@ static void test_3d() Tensor<float, 3, RowMajor> mat11(2,3,7); mat11 = mat2 / 3.14f; - val = 1.0; + val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { @@ -155,7 +155,7 @@ static void test_3d() VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f); VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f); VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f); - val += 1.0; + val += 1.0f; } } } @@ -167,25 +167,25 @@ static void test_constants() Tensor<float, 3> mat2(2,3,7); Tensor<float, 3> mat3(2,3,7); - float val = 1.0; + float val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } mat2 = mat1.constant(3.14f); mat3 = mat1.cwiseMax(7.3f).exp(); - val = 1.0; + val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat2(i,j,k), 3.14f); VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f))); - val += 1.0; + val += 1.0f; } } } @@ -228,25 +228,25 @@ static void test_functors() Tensor<float, 3> mat2(2,3,7); Tensor<float, 3> mat3(2,3,7); - float val = 1.0; + float val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } mat2 = mat1.inverse().unaryExpr(&asinf); mat3 = mat1.unaryExpr(&tanhf); - val = 1.0; + val = 1.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k))); VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k))); - val += 1.0; + val += 1.0f; } } } diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index 89874349f..2f14ebc62 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp @@ -205,15 +205,15 @@ static void test_fft_real_input_energy() { VERIFY_IS_EQUAL(output.dimension(i), input.dimension(i)); } - float energy_original = 0.0; - float energy_after_fft = 0.0; + RealScalar energy_original = 0.0; + RealScalar energy_after_fft = 0.0; for (int i = 0; i < total_size; ++i) { - energy_original += pow(std::abs(input(i)), 2); + energy_original += numext::abs2(input(i)); } for (int i = 0; i < total_size; ++i) { - 
energy_after_fft += pow(std::abs(output(i)), 2); + energy_after_fft += numext::abs2(output(i)); } if(FFTDirection == FFT_FORWARD) { diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 46d741b05..4c660de65 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -188,13 +188,13 @@ static void test_3d() // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); // VERIFY_IS_EQUAL((mat1.dimension(2)), 7); - float val = 0.0; + float val = 0.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; mat2(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } @@ -210,13 +210,13 @@ static void test_3d() // VERIFY_IS_EQUAL((mat3.dimension(2)), 7); - val = 0.0; + val = 0.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val)); VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val)); - val += 1.0; + val += 1.0f; } } } @@ -226,12 +226,12 @@ static void test_3d() static void test_array() { TensorFixedSize<float, Sizes<2, 3, 7> > mat1; - float val = 0.0; + float val = 0.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { mat1(i,j,k) = val; - val += 1.0; + val += 1.0f; } } } @@ -239,12 +239,12 @@ static void test_array() TensorFixedSize<float, Sizes<2, 3, 7> > mat3; mat3 = mat1.pow(3.5f); - val = 0.0; + val = 0.0f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f)); - val += 1.0; + val += 1.0f; } } } diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp index 5d6a49181..988b01481 100644 --- a/unsupported/test/cxx11_tensor_image_patch.cpp +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -568,13 +568,7 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out.dimension(4), 16); // RowMajor - Tensor<float, 4, RowMajor> l_in_row_major = l_in.swap_layout(); - VERIFY_IS_EQUAL(l_in.dimension(0), l_in_row_major.dimension(3)); - VERIFY_IS_EQUAL(l_in.dimension(1), l_in_row_major.dimension(2)); - VERIFY_IS_EQUAL(l_in.dimension(2), l_in_row_major.dimension(1)); - VERIFY_IS_EQUAL(l_in.dimension(3), l_in_row_major.dimension(0)); - - Tensor<float, 5, RowMajor> l_out_row_major = l_in_row_major.extract_image_patches(11, 11); + Tensor<float, 5, RowMajor> l_out_row_major = l_in.swap_layout().extract_image_patches(11, 11); VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 16); VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 128*128); VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); @@ -589,10 +583,8 @@ static void test_imagenet_patches() for (int r = 0; r < 11; ++r) { for (int d = 0; d < 3; ++d) { float expected = 0.0f; - float expected_row_major = 0.0f; if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { expected = l_in(d, r-5+i, c-5+j, b); - expected_row_major = l_in_row_major(b, c-5+j, r-5+i, d); } // ColMajor if (l_out(d, r, c, patchId, b) != expected) { @@ -601,15 +593,13 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); // RowMajor if (l_out_row_major(b, patchId, c, r, d) != - expected_row_major) { + expected) { std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), - expected_row_major); - // Check that ColMajor and RowMajor agree. 
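The image-patch hunks simplify the RowMajor checks: instead of materializing l_in_row_major and carrying a second expected value, the row-major patches are now produced directly from l_in.swap_layout() and compared against the same expected scalar, since swapping the layout only reverses the dimension order. In isolation the pattern looks like the sketch below, with tensor sizes invented for illustration:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using Eigen::Tensor;
  using Eigen::RowMajor;

  // ColMajor input laid out as depth x rows x cols x batch.
  Tensor<float, 4> in(3, 16, 16, 2);
  in.setRandom();

  // ColMajor patches: depth x patch_rows x patch_cols x num_patches x batch.
  Tensor<float, 5> patches = in.extract_image_patches(5, 5);

  // RowMajor patches from the same data, without an intermediate copy; the
  // dimensions come out reversed: batch x num_patches x patch_cols x patch_rows x depth.
  Tensor<float, 5, RowMajor> patches_rm =
      in.swap_layout().extract_image_patches(5, 5);

  // Element (d, r, c, p, b) of `patches` equals element (b, p, c, r, d) of
  // `patches_rm`, which is exactly what the updated test verifies.
  return 0;
}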
- VERIFY_IS_EQUAL(expected, expected_row_major); + expected); } } } @@ -628,8 +618,7 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out.dimension(4), 32); // RowMajor - l_in_row_major = l_in.swap_layout(); - l_out_row_major = l_in_row_major.extract_image_patches(9, 9); + l_out_row_major = l_in.swap_layout().extract_image_patches(9, 9); VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); @@ -644,10 +633,8 @@ static void test_imagenet_patches() for (int r = 0; r < 9; ++r) { for (int d = 0; d < 16; ++d) { float expected = 0.0f; - float expected_row_major = 0.0f; if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { expected = l_in(d, r-4+i, c-4+j, b); - expected_row_major = l_in_row_major(b, c-4+j, r-4+i, d); } // ColMajor if (l_out(d, r, c, patchId, b) != expected) { @@ -655,12 +642,10 @@ static void test_imagenet_patches() } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) { + if (l_out_row_major(b, patchId, c, r, d) != expected) { std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected, expected_row_major); + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); } } } @@ -679,8 +664,7 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out.dimension(4), 32); // RowMajor - l_in_row_major = l_in.swap_layout(); - l_out_row_major = l_in_row_major.extract_image_patches(7, 7); + l_out_row_major = l_in.swap_layout().extract_image_patches(7, 7); VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); @@ -695,10 +679,8 @@ static void test_imagenet_patches() for (int r = 0; r < 7; ++r) { for (int d = 0; d < 32; ++d) { float expected = 0.0f; - float expected_row_major = 0.0f; if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { expected = l_in(d, r-3+i, c-3+j, b); - expected_row_major = l_in_row_major(b, c-3+j, r-3+i, d); } // ColMajor if (l_out(d, r, c, patchId, b) != expected) { @@ -706,12 +688,10 @@ static void test_imagenet_patches() } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) { + if (l_out_row_major(b, patchId, c, r, d) != expected) { std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. 
- VERIFY_IS_EQUAL(expected, expected_row_major); + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); } } } @@ -730,8 +710,7 @@ static void test_imagenet_patches() VERIFY_IS_EQUAL(l_out.dimension(4), 32); // RowMajor - l_in_row_major = l_in.swap_layout(); - l_out_row_major = l_in_row_major.extract_image_patches(3, 3); + l_out_row_major = l_in.swap_layout().extract_image_patches(3, 3); VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); @@ -746,10 +725,8 @@ static void test_imagenet_patches() for (int r = 0; r < 3; ++r) { for (int d = 0; d < 64; ++d) { float expected = 0.0f; - float expected_row_major = 0.0f; if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { expected = l_in(d, r-1+i, c-1+j, b); - expected_row_major = l_in_row_major(b, c-1+j, r-1+i, d); } // ColMajor if (l_out(d, r, c, patchId, b) != expected) { @@ -757,12 +734,10 @@ static void test_imagenet_patches() } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); // RowMajor - if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) { + if (l_out_row_major(b, patchId, c, r, d) != expected) { std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } - VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major); - // Check that ColMajor and RowMajor agree. - VERIFY_IS_EQUAL(expected, expected_row_major); + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); } } } diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index 4ce8dea20..4cf5df666 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -159,6 +159,111 @@ static void test_type2index_list() } +static void test_type2indexpair_list() +{ + Tensor<float, 5> tensor(2,3,5,7,11); + tensor.setRandom(); + tensor += tensor.constant(10.0f); + + typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0; + typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a; + typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b; + typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c; + + Dims0 d0; + Dims2_a d2_a; + + Dims2_b d2_b; + d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11)); + + Dims2_c d2_c; + d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10))); + d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11)); // setting type2indexpair to correct value. 
+ d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12)); + + VERIFY_IS_EQUAL(d2_a[0].first, 0); + VERIFY_IS_EQUAL(d2_a[0].second, 10); + VERIFY_IS_EQUAL(d2_a[1].first, 1); + VERIFY_IS_EQUAL(d2_a[1].second, 11); + VERIFY_IS_EQUAL(d2_a[2].first, 2); + VERIFY_IS_EQUAL(d2_a[2].second, 12); + + VERIFY_IS_EQUAL(d2_b[0].first, 0); + VERIFY_IS_EQUAL(d2_b[0].second, 10); + VERIFY_IS_EQUAL(d2_b[1].first, 1); + VERIFY_IS_EQUAL(d2_b[1].second, 11); + VERIFY_IS_EQUAL(d2_b[2].first, 2); + VERIFY_IS_EQUAL(d2_b[2].second, 12); + + VERIFY_IS_EQUAL(d2_c[0].first, 0); + VERIFY_IS_EQUAL(d2_c[0].second, 10); + VERIFY_IS_EQUAL(d2_c[1].first, 1); + VERIFY_IS_EQUAL(d2_c[1].second, 11); + VERIFY_IS_EQUAL(d2_c[2].first, 2); + VERIFY_IS_EQUAL(d2_c[2].second, 12); + + EIGEN_STATIC_ASSERT((d2_a.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((d2_a.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((d2_a.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((d2_b.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((d2_b.value_known_statically(1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((d2_b.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((d2_c.value_known_statically(0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((d2_c.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((d2_c.value_known_statically(2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 0) == false), 
YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 10) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE); +} + + static void test_dynamic_index_list() { Tensor<float, 4> tensor(2,3,5,7); @@ -273,6 +378,7 @@ void test_cxx11_tensor_index_list() #ifdef EIGEN_HAS_INDEX_LIST CALL_SUBTEST(test_static_index_list()); 
CALL_SUBTEST(test_type2index_list()); + CALL_SUBTEST(test_type2indexpair_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); CALL_SUBTEST(test_dim_check()); diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp index 48aa6d368..8e2b70b75 100644 --- a/unsupported/test/cxx11_tensor_intdiv.cpp +++ b/unsupported/test/cxx11_tensor_intdiv.cpp @@ -128,7 +128,7 @@ void test_powers_64bit() { void test_specific() { // A particular combination that was previously failing int64_t div = 209715200; - int64_t num = 3238002688; + int64_t num = 3238002688ll; Eigen::internal::TensorIntDivisor<int64_t> divider(div); int64_t result = num/div; int64_t result_op = divider.divide(num); diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp index 8bbcf7089..489960529 100644 --- a/unsupported/test/cxx11_tensor_io.cpp +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -14,6 +14,20 @@ template<int DataLayout> +static void test_output_0d() +{ + Tensor<int, 0, DataLayout> tensor; + tensor() = 123; + + std::stringstream os; + os << tensor; + + std::string expected("123"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> static void test_output_1d() { Tensor<int, 1, DataLayout> tensor(5); @@ -26,6 +40,12 @@ static void test_output_1d() std::string expected("0\n1\n2\n3\n4"); VERIFY_IS_EQUAL(std::string(os.str()), expected); + + Eigen::Tensor<double,1,DataLayout> empty_tensor(0); + std::stringstream empty_os; + empty_os << empty_tensor; + std::string empty_string; + VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string); } @@ -101,6 +121,8 @@ static void test_output_const() void test_cxx11_tensor_io() { + CALL_SUBTEST(test_output_0d<ColMajor>()); + CALL_SUBTEST(test_output_0d<RowMajor>()); CALL_SUBTEST(test_output_1d<ColMajor>()); CALL_SUBTEST(test_output_1d<RowMajor>()); CALL_SUBTEST(test_output_2d<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index eb3b891fd..f7de43110 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -13,6 +13,7 @@ using Eigen::Tensor; +template<typename> static void test_simple_reshape() { Tensor<float, 5> tensor1(2,3,1,7,1); @@ -40,7 +41,7 @@ static void test_simple_reshape() } } - +template<typename> static void test_reshape_in_expr() { MatrixXf m1(2,3*5*7*11); MatrixXf m2(3*5*7*11,13); @@ -65,7 +66,7 @@ static void test_reshape_in_expr() { } } - +template<typename> static void test_reshape_as_lvalue() { Tensor<float, 3> tensor(2,3,7); @@ -114,6 +115,7 @@ static void test_simple_slice() } } +template<typename=void> static void test_const_slice() { const float b[1] = {42}; @@ -315,6 +317,128 @@ static void test_slice_raw_data() VERIFY_IS_EQUAL(slice6.data(), tensor.data()); } + +template<int DataLayout> +static void test_strided_slice() +{ + typedef Tensor<float, 5, DataLayout> Tensor5f; + typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5; + typedef Tensor<float, 2, DataLayout> Tensor2f; + typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2; + Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + Tensor<float, 2, DataLayout> tensor2(7,11); + tensor.setRandom(); + tensor2.setRandom(); + + if (true) { + Tensor2f slice(2,3); + Index2 strides(-2,-1); + Index2 indicesStart(5,7); + Index2 indicesStop(0,4); + slice = tensor2.stridedSlice(indicesStart, indicesStop, strides); + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + 
VERIFY_IS_EQUAL(slice(j,k), tensor2(5-2*j,7-k)); + } + } + } + + if(true) { + Tensor2f slice(0,1); + Index2 strides(1,1); + Index2 indicesStart(5,4); + Index2 indicesStop(5,5); + slice = tensor2.stridedSlice(indicesStart, indicesStop, strides); + } + + if(true) { // test clamped degenerate intervals + Tensor2f slice(7,11); + Index2 strides(1,-1); + Index2 indicesStart(-3,20); // should become 0,10 + Index2 indicesStop(20,-11); // should become 11, -1 + slice = tensor2.stridedSlice(indicesStart, indicesStop, strides); + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 11; ++k) { + VERIFY_IS_EQUAL(slice(j,k), tensor2(j,10-k)); + } + } + } + + if(true) { + Tensor5f slice1(1,1,1,1,1); + Eigen::DSizes<Eigen::DenseIndex, 5> indicesStart(1, 2, 3, 4, 5); + Eigen::DSizes<Eigen::DenseIndex, 5> indicesStop(2, 3, 4, 5, 6); + Eigen::DSizes<Eigen::DenseIndex, 5> strides(1, 1, 1, 1, 1); + slice1 = tensor.stridedSlice(indicesStart, indicesStop, strides); + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + } + + if(true) { + Tensor5f slice(1,1,2,2,3); + Index5 start(1, 1, 3, 4, 5); + Index5 stop(2, 2, 5, 6, 8); + Index5 strides(1, 1, 1, 1, 1); + slice = tensor.stridedSlice(start, stop, strides); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + } + + if(true) { + Tensor5f slice(1,1,2,2,3); + Index5 strides3(1, 1, -2, 1, -1); + Index5 indices3Start(1, 1, 4, 4, 7); + Index5 indices3Stop(2, 2, 0, 6, 4); + slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,4-2*i,4+j,7-k)); + } + } + } + } + + if(false) { // tests degenerate interval + Tensor5f slice(1,1,2,2,3); + Index5 strides3(1, 1, 2, 1, 1); + Index5 indices3Start(1, 1, 4, 4, 7); + Index5 indices3Stop(2, 2, 0, 6, 4); + slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3); + } +} + +template<int DataLayout> +static void test_strided_slice_write() +{ + typedef Tensor<float, 2, DataLayout> Tensor2f; + typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2; + + Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11); + tensor.setRandom(); + tensor2=tensor; + Tensor2f slice(2,3); + + slice.setRandom(); + + Index2 strides(1,1); + Index2 indicesStart(3,4); + Index2 indicesStop(5,7); + Index2 lengths(2,3); + + tensor.slice(indicesStart,lengths)=slice; + tensor2.stridedSlice(indicesStart,indicesStop,strides)=slice; + + for(int i=0;i<7;i++) for(int j=0;j<11;j++){ + VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j)); + } +} + + template<int DataLayout> static void test_composition() { @@ -337,20 +461,25 @@ static void test_composition() void test_cxx11_tensor_morphing() { - CALL_SUBTEST(test_simple_reshape()); - CALL_SUBTEST(test_reshape_in_expr()); - CALL_SUBTEST(test_reshape_as_lvalue()); - - CALL_SUBTEST(test_simple_slice<ColMajor>()); - CALL_SUBTEST(test_simple_slice<RowMajor>()); - CALL_SUBTEST(test_const_slice()); - CALL_SUBTEST(test_slice_in_expr<ColMajor>()); - CALL_SUBTEST(test_slice_in_expr<RowMajor>()); - CALL_SUBTEST(test_slice_as_lvalue<ColMajor>()); - CALL_SUBTEST(test_slice_as_lvalue<RowMajor>()); - CALL_SUBTEST(test_slice_raw_data<ColMajor>()); - CALL_SUBTEST(test_slice_raw_data<RowMajor>()); - - CALL_SUBTEST(test_composition<ColMajor>()); - CALL_SUBTEST(test_composition<RowMajor>()); + CALL_SUBTEST_1(test_simple_reshape<void>()); + 
CALL_SUBTEST_1(test_reshape_in_expr<void>()); + CALL_SUBTEST_1(test_reshape_as_lvalue<void>()); + + CALL_SUBTEST_1(test_simple_slice<ColMajor>()); + CALL_SUBTEST_1(test_simple_slice<RowMajor>()); + CALL_SUBTEST_1(test_const_slice()); + CALL_SUBTEST_2(test_slice_in_expr<ColMajor>()); + CALL_SUBTEST_3(test_slice_in_expr<RowMajor>()); + CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>()); + CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>()); + CALL_SUBTEST_5(test_slice_raw_data<ColMajor>()); + CALL_SUBTEST_5(test_slice_raw_data<RowMajor>()); + + CALL_SUBTEST_6(test_strided_slice_write<ColMajor>()); + CALL_SUBTEST_6(test_strided_slice<ColMajor>()); + CALL_SUBTEST_6(test_strided_slice_write<RowMajor>()); + CALL_SUBTEST_6(test_strided_slice<RowMajor>()); + + CALL_SUBTEST_7(test_composition<ColMajor>()); + CALL_SUBTEST_7(test_composition<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 37fe3e9a4..2f86980a2 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -13,14 +13,55 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU - +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; +template<typename> +void test_cuda_numext() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); + bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>()); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>()); + + Tensor<bool, 1> half_prec(num_elem); + Tensor<bool, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking numext " << i << std::endl; + VERIFY_IS_EQUAL(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + #ifdef EIGEN_HAS_CUDA_FP16 +template<typename> void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -55,7 +96,7 @@ void test_cuda_conversion() { gpu_device.deallocate(d_conv); } - +template<typename> void test_cuda_unary() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -92,7 +133,7 @@ void test_cuda_unary() { gpu_device.deallocate(d_res_float); } - +template<typename> void test_cuda_elementwise() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -124,8 +165,8 @@ void test_cuda_elementwise() { gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { - 
std::cout << "Checking elemwise " << i << std::endl; - VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl; + VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i))); } gpu_device.deallocate(d_float1); @@ -134,6 +175,7 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_float); } +template<typename> void test_cuda_trancendental() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -141,43 +183,58 @@ void test_cuda_trancendental() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res1_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res1_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res2_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res2_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1( - d_float1, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2( - d_float2, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res1_half( - d_res1_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res1_float( - d_res1_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res2_half( - d_res2_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res2_float( - d_res2_float, num_elem); + gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); + gpu_float3.device(gpu_device) = gpu_float3.random(); + gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>(); + 
gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>(); + gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>(); - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); - gpu_res1_float.device(gpu_device) = gpu_float1.exp(); - gpu_res2_float.device(gpu_device) = gpu_float2.log(); - gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().exp().cast<float>(); - gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>().log().cast<float>(); + gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>(); + gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); + + gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>(); + gpu_res2_half.device(gpu_device) = gpu_res2_half.log(); + + gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); + gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); Tensor<float, 1> input1(num_elem); - Tensor<float, 1> half_prec1(num_elem); - Tensor<float, 1> full_prec1(num_elem); + Tensor<Eigen::half, 1> half_prec1(num_elem); + Tensor<Eigen::half, 1> full_prec1(num_elem); Tensor<float, 1> input2(num_elem); - Tensor<float, 1> half_prec2(num_elem); - Tensor<float, 1> full_prec2(num_elem); + Tensor<Eigen::half, 1> half_prec2(num_elem); + Tensor<Eigen::half, 1> full_prec2(num_elem); + Tensor<float, 1> input3(num_elem); + Tensor<Eigen::half, 1> half_prec3(num_elem); + Tensor<Eigen::half, 1> full_prec3(num_elem); gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { @@ -186,17 +243,27 @@ void test_cuda_trancendental() { } for (int i = 0; i < num_elem; ++i) { std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl; - VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); + if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1 + VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f)); + else + VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); + } + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl; + VERIFY_IS_APPROX(full_prec3(i), half_prec3(i)); } gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); + 
gpu_device.deallocate(d_float3); gpu_device.deallocate(d_res1_half); gpu_device.deallocate(d_res1_float); gpu_device.deallocate(d_res2_half); gpu_device.deallocate(d_res2_float); + gpu_device.deallocate(d_res3_float); + gpu_device.deallocate(d_res3_half); } - +template<typename> void test_cuda_contractions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -206,36 +273,38 @@ void test_cuda_contractions() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( d_float1, rows, cols); Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( d_float2, rows, cols); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_half( + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half( d_res_half, rows, cols); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_float( + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float( d_res_float, rows, cols); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); - gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float1.constant(0.5f); + gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f); typedef Tensor<float, 2>::DimensionPair DimPair; Eigen::array<DimPair, 1> dims(DimPair(1, 0)); - gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims).cast<float>(); + gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims); - Tensor<float, 2> half_prec(rows, cols); - Tensor<float, 2> full_prec(rows, cols); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + Tensor<Eigen::half, 2> half_prec(rows, cols); + Tensor<Eigen::half, 2> full_prec(rows, cols); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half)); gpu_device.synchronize(); for (int i = 0; i < rows; ++i) { for (int j = 0; j < cols; ++j) { - std::cout << "Checking contract " << i << " " << j << std::endl; - VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl; + if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) { + VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + } } } @@ -245,8 +314,69 @@ void test_cuda_contractions() { gpu_device.deallocate(d_res_float); } +template<typename> +void test_cuda_reductions(int size1, int size2, int redux) { + std::cout << "Reducing " << size1 << " by " << size2 + << " tensor along dim " << redux << std::endl; + + Eigen::CudaStreamDevice stream; + 
Eigen::GpuDevice gpu_device(&stream); + int num_elem = size1*size2; + int result_size = (redux == 1 ? size1 : size2); + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); + + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( + d_float1, size1, size2); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( + d_float2, size1, size2); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, result_size); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, result_size); + + gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f; + gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f; + + Eigen::array<int, 1> redux_dim = {{redux}}; + gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim); + + Tensor<Eigen::half, 1> half_prec(result_size); + Tensor<Eigen::half, 1> full_prec(result_size); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half)); + gpu_device.synchronize(); + + for (int i = 0; i < result_size; ++i) { + std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + +template<typename> void test_cuda_reductions() { + test_cuda_reductions<void>(13, 13, 0); + test_cuda_reductions<void>(13, 13, 1); + + test_cuda_reductions<void>(35, 36, 0); + test_cuda_reductions<void>(35, 36, 1); + + test_cuda_reductions<void>(36, 35, 0); + test_cuda_reductions<void>(36, 35, 1); +} + +template<typename> +void test_cuda_full_reductions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int size = 13; @@ -254,35 +384,39 @@ void test_cuda_reductions() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float)); - float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float)); + Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); + Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( d_float1, size, size); Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( d_float2, size, size); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( - d_res_half, size); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( - d_res_float, size); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half( + d_res_half); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float( + d_res_float); gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); - Eigen::array<int, 1> 
redux_dim = {{0}}; - gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim).cast<float>(); + gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(); - Tensor<float, 1> half_prec(size); - Tensor<float, 1> full_prec(size); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(float)); - gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(float)); + Tensor<Eigen::half, 0> half_prec; + Tensor<Eigen::half, 0> full_prec; + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); gpu_device.synchronize(); - for (int i = 0; i < size; ++i) { - std::cout << "Checking redux " << i << std::endl; - VERIFY_IS_APPROX(full_prec(i), half_prec(i)); - } + VERIFY_IS_APPROX(full_prec(), half_prec()); + + gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum(); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); + gpu_device.synchronize(); + + VERIFY_IS_APPROX(full_prec(), half_prec()); gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); @@ -290,6 +424,7 @@ void test_cuda_reductions() { gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } +template<typename> void test_cuda_forced_evals() { Eigen::CudaStreamDevice stream; @@ -297,59 +432,62 @@ void test_cuda_forced_evals() { int num_elem = 101; float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( d_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( - d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1( + d_res_half1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2( + d_res_half2, num_elem); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( d_res_float, num_elem); + Eigen::array<int, 1> no_bcast; + no_bcast[0] = 1; + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>(); + gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>(); + gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>(); - Tensor<float, 1> half_prec(num_elem); + Tensor<float, 1> half_prec1(num_elem); + Tensor<float, 1> half_prec2(num_elem); Tensor<float, 1> full_prec(num_elem); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float)); 
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking unary " << i << std::endl; - VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec1(i)); + VERIFY_IS_APPROX(full_prec(i), half_prec2(i)); } gpu_device.deallocate(d_float); - gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_half1); + gpu_device.deallocate(d_res_half2); gpu_device.deallocate(d_res_float); } - #endif void test_cxx11_tensor_of_float16_cuda() { + CALL_SUBTEST_1(test_cuda_numext<void>()); + #ifdef EIGEN_HAS_CUDA_FP16 - Eigen::CudaStreamDevice stream; - Eigen::GpuDevice device(&stream); - if (device.majorDeviceVersion() > 5 || - (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { - std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl; - - CALL_SUBTEST_1(test_cuda_conversion()); - CALL_SUBTEST_1(test_cuda_unary()); - CALL_SUBTEST_1(test_cuda_elementwise()); - CALL_SUBTEST_1(test_cuda_trancendental()); - CALL_SUBTEST_2(test_cuda_contractions()); - CALL_SUBTEST_3(test_cuda_reductions()); - CALL_SUBTEST_4(test_cuda_forced_evals()); - } - else { - std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl; - } + CALL_SUBTEST_1(test_cuda_conversion<void>()); + CALL_SUBTEST_1(test_cuda_unary<void>()); + CALL_SUBTEST_1(test_cuda_elementwise<void>()); + CALL_SUBTEST_1(test_cuda_trancendental<void>()); + CALL_SUBTEST_2(test_cuda_contractions<void>()); + CALL_SUBTEST_3(test_cuda_reductions<void>()); + CALL_SUBTEST_4(test_cuda_full_reductions<void>()); + CALL_SUBTEST_5(test_cuda_forced_evals<void>()); #else std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; #endif diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu index 5d091de15..b3be199e1 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_cuda.cu @@ -13,10 +13,61 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif #include "main.h" #include <Eigen/CXX11/Tensor> -static void test_default() + +void test_cuda_random_uniform() +{ + Tensor<float, 2> out(72,97); + out.setZero(); + + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_out; + cudaMalloc((void**)(&d_out), out_bytes); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); + + gpu_out.device(gpu_device) = gpu_out.random(); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + // For now we just check the code doesn't crash. 
+ // TODO: come up with a valid test of randomness +} + + +void test_cuda_random_normal() +{ + Tensor<float, 2> out(72,97); + out.setZero(); + + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_out; + cudaMalloc((void**)(&d_out), out_bytes); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); + + Eigen::internal::NormalRandomGenerator<float> gen(true); + gpu_out.device(gpu_device) = gpu_out.random(gen); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); +} + +static void test_complex() { Tensor<std::complex<float>, 1> vec(6); vec.setRandom(); @@ -31,5 +82,7 @@ static void test_default() void test_cxx11_tensor_random_cuda() { - CALL_SUBTEST(test_default()); + CALL_SUBTEST(test_cuda_random_uniform()); + CALL_SUBTEST(test_cuda_random_normal()); + CALL_SUBTEST(test_complex()); } diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 6a128901a..1490ec3da 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -239,6 +239,33 @@ static void test_simple_reductions() { } } + +template <int DataLayout> +static void test_reductions_in_expr() { + Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7); + tensor.setRandom(); + array<ptrdiff_t, 2> reduction_axis2; + reduction_axis2[0] = 1; + reduction_axis2[1] = 3; + + Tensor<float, 2, DataLayout> result(2, 5); + result = result.constant(1.0f) - tensor.sum(reduction_axis2); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + float sum = 0.0f; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor(i, k, j, l); + } + } + VERIFY_IS_APPROX(result(i, j), 1.0f - sum); + } + } +} + + template <int DataLayout> static void test_full_reductions() { Tensor<float, 2, DataLayout> tensor(2, 3); @@ -341,7 +368,7 @@ static void test_static_dims() { Tensor<float, 2, DataLayout> out(72, 97); in.setRandom(); -#ifndef EIGEN_HAS_CONSTEXPR +#if !EIGEN_HAS_CONSTEXPR array<int, 2> reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; @@ -371,7 +398,7 @@ static void test_innermost_last_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#ifndef EIGEN_HAS_CONSTEXPR +#if !EIGEN_HAS_CONSTEXPR array<int, 2> reduction_axis; reduction_axis[0] = 0; reduction_axis[1] = 1; @@ -402,7 +429,7 @@ static void test_innermost_first_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#ifndef EIGEN_HAS_CONSTEXPR +#if !EIGEN_HAS_CONSTEXPR array<int, 2> reduction_axis; reduction_axis[0] = 2; reduction_axis[1] = 3; @@ -433,7 +460,7 @@ static void test_reduce_middle_dims() { in.setRandom(); // Reduce on the innermost dimensions. 
-#ifndef EIGEN_HAS_CONSTEXPR +#if !EIGEN_HAS_CONSTEXPR array<int, 2> reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 2; @@ -462,6 +489,8 @@ void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_trivial_reductions<RowMajor>()); CALL_SUBTEST(test_simple_reductions<ColMajor>()); CALL_SUBTEST(test_simple_reductions<RowMajor>()); + CALL_SUBTEST(test_reductions_in_expr<ColMajor>()); + CALL_SUBTEST(test_reductions_in_expr<RowMajor>()); CALL_SUBTEST(test_full_reductions<ColMajor>()); CALL_SUBTEST(test_full_reductions<RowMajor>()); CALL_SUBTEST(test_user_defined_reductions<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index cad0c08e0..6858b43a7 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -12,11 +12,14 @@ #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda #define EIGEN_USE_GPU +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> -template<int DataLayout> +template<typename Type, int DataLayout> static void test_full_reductions() { Eigen::CudaStreamDevice stream; @@ -25,24 +28,24 @@ static void test_full_reductions() { const int num_rows = internal::random<int>(1024, 5*1024); const int num_cols = internal::random<int>(1024, 5*1024); - Tensor<float, 2, DataLayout> in(num_rows, num_cols); + Tensor<Type, 2, DataLayout> in(num_rows, num_cols); in.setRandom(); - Tensor<float, 0, DataLayout> full_redux; + Tensor<Type, 0, DataLayout> full_redux; full_redux = in.sum(); - std::size_t in_bytes = in.size() * sizeof(float); - std::size_t out_bytes = full_redux.size() * sizeof(float); - float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes)); - float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes)); + std::size_t in_bytes = in.size() * sizeof(Type); + std::size_t out_bytes = full_redux.size() * sizeof(Type); + Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes)); + Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes)); gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); - TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols); - TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr); + TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr); out_gpu.device(gpu_device) = in_gpu.sum(); - Tensor<float, 0, DataLayout> full_redux_gpu; + Tensor<Type, 0, DataLayout> full_redux_gpu; gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); gpu_device.synchronize(); @@ -53,7 +56,102 @@ static void test_full_reductions() { gpu_device.deallocate(gpu_out_ptr); } +template<typename Type, int DataLayout> +static void test_first_dim_reductions() { + int dim_x = 33; + int dim_y = 1; + int dim_z = 128; + + Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z); + in.setRandom(); + + Eigen::array<int, 1> red_axis; + red_axis[0] = 0; + Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); + + // Create device + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice dev(&stream); + + // Create data(T) + Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type)); + Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type)); + Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z); + Eigen::TensorMap<Eigen::Tensor<Type, 2, 
DataLayout> > gpu_out(out_data, dim_y, dim_z); + + // Perform operation + dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type)); + gpu_out.device(dev) = gpu_in.sum(red_axis); + gpu_out.device(dev) += gpu_in.sum(red_axis); + Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z); + dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type)); + dev.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + for (int i = 0; i < gpu_out.size(); ++i) { + VERIFY_IS_APPROX(2*redux(i), redux_gpu(i)); + } + + dev.deallocate(in_data); + dev.deallocate(out_data); +} + +template<typename Type, int DataLayout> +static void test_last_dim_reductions() { + int dim_x = 128; + int dim_y = 1; + int dim_z = 33; + + Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z); + in.setRandom(); + + Eigen::array<int, 1> red_axis; + red_axis[0] = 2; + Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); + + // Create device + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice dev(&stream); + + // Create data + Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type)); + Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type)); + Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z); + Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y); + + // Perform operation + dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type)); + gpu_out.device(dev) = gpu_in.sum(red_axis); + gpu_out.device(dev) += gpu_in.sum(red_axis); + Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y); + dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type)); + dev.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + for (int i = 0; i < gpu_out.size(); ++i) { + VERIFY_IS_APPROX(2*redux(i), redux_gpu(i)); + } + + dev.deallocate(in_data); + dev.deallocate(out_data); +} + + void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST_1(test_full_reductions<ColMajor>()); - CALL_SUBTEST_2(test_full_reductions<RowMajor>()); + CALL_SUBTEST_1((test_full_reductions<float, ColMajor>())); + CALL_SUBTEST_1((test_full_reductions<double, ColMajor>())); + CALL_SUBTEST_2((test_full_reductions<float, RowMajor>())); + CALL_SUBTEST_2((test_full_reductions<double, RowMajor>())); + + CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>())); + CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>())); + CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>())); +// Outer reductions of doubles aren't supported just yet. +// CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>())) + + CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>())); +// Outer reductions of doubles aren't supported just yet. +// CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>())); + CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>())); + CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>())); } diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp new file mode 100644 index 000000000..af59aa3ef --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan.cpp @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" +#include <limits> +#include <numeric> +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <int DataLayout, typename Type=float, bool Exclusive = false> +static void test_1d_scan() +{ + int size = 50; + Tensor<Type, 1, DataLayout> tensor(size); + tensor.setRandom(); + Tensor<Type, 1, DataLayout> result = tensor.cumsum(0, Exclusive); + + VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0)); + + float accum = 0; + for (int i = 0; i < size; i++) { + if (Exclusive) { + VERIFY_IS_EQUAL(result(i), accum); + accum += tensor(i); + } else { + accum += tensor(i); + VERIFY_IS_EQUAL(result(i), accum); + } + } + + accum = 1; + result = tensor.cumprod(0, Exclusive); + for (int i = 0; i < size; i++) { + if (Exclusive) { + VERIFY_IS_EQUAL(result(i), accum); + accum *= tensor(i); + } else { + accum *= tensor(i); + VERIFY_IS_EQUAL(result(i), accum); + } + } +} + +template <int DataLayout, typename Type=float> +static void test_4d_scan() +{ + int size = 5; + Tensor<Type, 4, DataLayout> tensor(size, size, size, size); + tensor.setRandom(); + + Tensor<Type, 4, DataLayout> result(size, size, size, size); + + result = tensor.cumsum(0); + float accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(i, 1, 2, 3); + VERIFY_IS_EQUAL(result(i, 1, 2, 3), accum); + } + result = tensor.cumsum(1); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, i, 2, 3); + VERIFY_IS_EQUAL(result(1, i, 2, 3), accum); + } + result = tensor.cumsum(2); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, 2, i, 3); + VERIFY_IS_EQUAL(result(1, 2, i, 3), accum); + } + result = tensor.cumsum(3); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, 2, 3, i); + VERIFY_IS_EQUAL(result(1, 2, 3, i), accum); + } +} + +template <int DataLayout> +static void test_tensor_maps() { + int inputs[20]; + TensorMap<Tensor<int, 1, DataLayout> > tensor_map(inputs, 20); + tensor_map.setRandom(); + + Tensor<int, 1, DataLayout> result = tensor_map.cumsum(0); + + int accum = 0; + for (int i = 0; i < 20; ++i) { + accum += tensor_map(i); + VERIFY_IS_EQUAL(result(i), accum); + } +} + +void test_cxx11_tensor_scan() { + CALL_SUBTEST((test_1d_scan<ColMajor, float, true>())); + CALL_SUBTEST((test_1d_scan<ColMajor, float, false>())); + CALL_SUBTEST((test_1d_scan<RowMajor, float, true>())); + CALL_SUBTEST((test_1d_scan<RowMajor, float, false>())); + CALL_SUBTEST(test_4d_scan<ColMajor>()); + CALL_SUBTEST(test_4d_scan<RowMajor>()); + CALL_SUBTEST(test_tensor_maps<ColMajor>()); + CALL_SUBTEST(test_tensor_maps<RowMajor>()); +} diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu new file mode 100644 index 000000000..761d11fd1 --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -0,0 +1,79 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#include <cuda_fp16.h> +#endif +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +void test_cuda_cumsum(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); + Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size); + Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size); + + t_input.setRandom(); + + std::size_t t_input_bytes = t_input.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_input; + float* d_t_result; + + cudaMalloc((void**)(&d_t_input), t_input_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > + gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size)); + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size)); + + gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); + t_result = t_input.cumsum(1); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + for (size_t i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); + } + + cudaFree((void*)d_t_input); + cudaFree((void*)d_t_result); +} + + +void test_cxx11_tensor_scan_cuda() +{ + CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128)); + CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128)); +} diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index a03f75cfe..2f56eb495 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -33,7 +33,7 @@ static void test_comparison_sugar() { } -static void test_scalar_sugar() { +static void test_scalar_sugar_add_mul() { Tensor<float, 3> A(6, 7, 5); Tensor<float, 3> B(6, 7, 5); A.setRandom(); @@ -41,21 +41,41 @@ static void test_scalar_sugar() { const float alpha = 0.43f; const float beta = 0.21f; + const float gamma = 0.14f; - Tensor<float, 3> R = A * A.constant(alpha) + B * B.constant(beta); - Tensor<float, 3> S = A * alpha + B * beta; - - // TODO: add enough syntactic sugar to support this - // Tensor<float, 3> T = alpha * A + beta * B; + Tensor<float, 3> R = A.constant(gamma) + A * A.constant(alpha) + B * B.constant(beta); + Tensor<float, 3> S = A * alpha + B * beta + gamma; + Tensor<float, 3> T = gamma + alpha * A + beta * B; for (int i = 0; i < 6*7*5; ++i) { VERIFY_IS_APPROX(R(i), S(i)); + VERIFY_IS_APPROX(R(i), T(i)); } } +static void test_scalar_sugar_sub_div() { + Tensor<float, 3> A(6, 7, 5); + Tensor<float, 3> B(6, 7, 5); + A.setRandom(); + B.setRandom(); + + const float alpha = 0.43f; + const float beta = 0.21f; + 
const float gamma = 0.14f; + const float delta = 0.32f; + + Tensor<float, 3> R = A.constant(gamma) - A / A.constant(alpha) + - B.constant(beta) / B - A.constant(delta); + Tensor<float, 3> S = gamma - A / alpha - beta / B - delta; + + for (int i = 0; i < 6*7*5; ++i) { + VERIFY_IS_APPROX(R(i), S(i)); + } +} void test_cxx11_tensor_sugar() { CALL_SUBTEST(test_comparison_sugar()); - CALL_SUBTEST(test_scalar_sugar()); + CALL_SUBTEST(test_scalar_sugar_add_mul()); + CALL_SUBTEST(test_scalar_sugar_sub_div()); } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e46197464..2ef665f30 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -91,7 +91,7 @@ void test_multithread_contraction() for (ptrdiff_t i = 0; i < t_result.size(); i++) { VERIFY(&t_result.data()[i] != &m_result.data()[i]); - if (fabs(t_result(i) - m_result(i)) < 1e-4) { + if (fabsf(t_result(i) - m_result(i)) < 1e-4f) { continue; } if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) { @@ -132,7 +132,7 @@ void test_contraction_corner_cases() for (ptrdiff_t i = 0; i < t_result.size(); i++) { assert(!(numext::isnan)(t_result.data()[i])); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } @@ -147,7 +147,7 @@ void test_contraction_corner_cases() m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { assert(!(numext::isnan)(t_result.data()[i])); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } @@ -165,7 +165,7 @@ void test_contraction_corner_cases() m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { assert(!(numext::isnan)(t_result.data()[i])); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } @@ -183,7 +183,7 @@ void test_contraction_corner_cases() m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { assert(!(numext::isnan)(t_result.data()[i])); - if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); } @@ -226,7 +226,7 @@ void test_multithread_contraction_agrees_with_singlethread() { for (ptrdiff_t i = 0; i < st_result.size(); i++) { // if both of the values are very small, then do nothing (because the test will fail // due to numerical precision issues when values are small) - if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) { + if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); } } @@ -234,6 +234,42 @@ void test_multithread_contraction_agrees_with_singlethread() { template<int DataLayout> +void test_full_contraction() { + int contract_size1 = internal::random<int>(1, 500); + int contract_size2 = 
internal::random<int>(1, 500); + + Tensor<float, 2, DataLayout> left(contract_size1, + contract_size2); + Tensor<float, 2, DataLayout> right(contract_size1, + contract_size2); + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor<float, 2>::DimensionPair DimPair; + Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}}); + + Eigen::ThreadPool tp(internal::random<int>(2, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11)); + + Tensor<float, 0, DataLayout> st_result; + st_result = left.contract(right, dims); + + Tensor<float, 0, DataLayout> tp_result; + tp_result.device(thread_pool_device) = left.contract(right, dims); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + // if both of the values are very small, then do nothing (because the test will fail + // due to numerical precision issues when values are small) + if (numext::abs(st_result() - tp_result()) >= 1e-4f) { + VERIFY_IS_APPROX(st_result(), tp_result()); + } +} + +template<int DataLayout> void test_multithreaded_reductions() { const int num_threads = internal::random<int>(3, 11); ThreadPool thread_pool(num_threads); @@ -324,6 +360,9 @@ void test_cxx11_tensor_thread_pool() CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>()); CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>()); + CALL_SUBTEST_4(test_full_contraction<ColMajor>()); + CALL_SUBTEST_4(test_full_contraction<RowMajor>()); + CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>()); CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>()); diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp index 02411a262..e770049e5 100644 --- a/unsupported/test/kronecker_product.cpp +++ b/unsupported/test/kronecker_product.cpp @@ -9,12 +9,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+#ifdef EIGEN_TEST_PART_1 #include "sparse.h" #include <Eigen/SparseExtra> #include <Eigen/KroneckerProduct> - template<typename MatrixType> void check_dimension(const MatrixType& ab, const int rows, const int cols) { @@ -230,3 +230,23 @@ void test_kronecker_product() VERIFY_IS_APPROX(MatrixXf(sC2),dC); } } + +#endif + +#ifdef EIGEN_TEST_PART_2 + +// simply check that for a dense kronecker product, sparse module is not needed + +#include "main.h" +#include <Eigen/KroneckerProduct> + +void test_kronecker_product() +{ + MatrixXd a(2,2), b(3,3), c; + a.setRandom(); + b.setRandom(); + c = kroneckerProduct(a,b); + VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b); +} + +#endif diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 9a995f941..7c9b68a3c 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A) MatrixType scaledA; RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff(); - if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI) - scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum; + if (maxImagPartOfSpectrum >= RealScalar(0.9L * EIGEN_PI)) + scaledA = A * RealScalar(0.9L * EIGEN_PI) / maxImagPartOfSpectrum; else scaledA = A; diff --git a/unsupported/test/matrix_functions.h b/unsupported/test/matrix_functions.h index 150b4c0c5..4e2636404 100644 --- a/unsupported/test/matrix_functions.h +++ b/unsupported/test/matrix_functions.h @@ -61,7 +61,7 @@ struct generateTestMatrix<MatrixType,1> }; template <typename Derived, typename OtherDerived> -double relerr(const MatrixBase<Derived>& A, const MatrixBase<OtherDerived>& B) +typename Derived::RealScalar relerr(const MatrixBase<Derived>& A, const MatrixBase<OtherDerived>& B) { return std::sqrt((A - B).cwiseAbs2().sum() / (std::min)(A.cwiseAbs2().sum(), B.cwiseAbs2().sum())); } diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index 8e104ed1e..7ccfacfdf 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -10,7 +10,7 @@ #include "matrix_functions.h" template<typename T> -void test2dRotation(double tol) +void test2dRotation(const T& tol) { Matrix<T,2,2> A, B, C; T angle, c, s; @@ -19,19 +19,19 @@ void test2dRotation(double tol) MatrixPower<Matrix<T,2,2> > Apow(A); for (int i=0; i<=20; ++i) { - angle = pow(10, (i-10) / 5.); + angle = std::pow(T(10), (i-10) / T(5.)); c = std::cos(angle); s = std::sin(angle); B << c, s, -s, c; - C = Apow(std::ldexp(angle,1) / EIGEN_PI); + C = Apow(std::ldexp(angle,1) / T(EIGEN_PI)); std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n'; VERIFY(C.isApprox(B, tol)); } } template<typename T> -void test2dHyperbolicRotation(double tol) +void test2dHyperbolicRotation(const T& tol) { Matrix<std::complex<T>,2,2> A, B, C; T angle, ch = std::cosh((T)1); @@ -53,7 +53,7 @@ void test2dHyperbolicRotation(double tol) } template<typename T> -void test3dRotation(double tol) +void test3dRotation(const T& tol) { Matrix<T,3,1> v; T angle; @@ -61,13 +61,13 @@ void test3dRotation(double tol) for (int i=0; i<=20; ++i) { v = Matrix<T,3,1>::Random(); v.normalize(); - angle = pow(10, (i-10) / 5.); + angle = std::pow(T(10), (i-10) / T(5.)); VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol)); } } template<typename MatrixType> -void testGeneral(const MatrixType& m, double tol) +void testGeneral(const MatrixType& m, const typename MatrixType::RealScalar& 
tol) { typedef typename MatrixType::RealScalar RealScalar; MatrixType m1, m2, m3, m4, m5; @@ -97,7 +97,7 @@ void testGeneral(const MatrixType& m, double tol) } template<typename MatrixType> -void testSingular(const MatrixType& m_const, double tol) +void testSingular(const MatrixType& m_const, const typename MatrixType::RealScalar& tol) { // we need to pass by reference in order to prevent errors with // MSVC for aligned data types ... @@ -119,18 +119,18 @@ void testSingular(const MatrixType& m_const, double tol) MatrixPower<MatrixType> mpow(m); T = T.sqrt(); - VERIFY(mpow(0.5).isApprox(U * (TriangularType(T) * U.adjoint()), tol)); + VERIFY(mpow(0.5L).isApprox(U * (TriangularType(T) * U.adjoint()), tol)); T = T.sqrt(); - VERIFY(mpow(0.25).isApprox(U * (TriangularType(T) * U.adjoint()), tol)); + VERIFY(mpow(0.25L).isApprox(U * (TriangularType(T) * U.adjoint()), tol)); T = T.sqrt(); - VERIFY(mpow(0.125).isApprox(U * (TriangularType(T) * U.adjoint()), tol)); + VERIFY(mpow(0.125L).isApprox(U * (TriangularType(T) * U.adjoint()), tol)); } } template<typename MatrixType> -void testLogThenExp(const MatrixType& m_const, double tol) +void testLogThenExp(const MatrixType& m_const, const typename MatrixType::RealScalar& tol) { // we need to pass by reference in order to prevent errors with // MSVC for aligned data types ... @@ -154,14 +154,14 @@ void test_matrix_power() { CALL_SUBTEST_2(test2dRotation<double>(1e-13)); CALL_SUBTEST_1(test2dRotation<float>(2e-5)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64 - CALL_SUBTEST_9(test2dRotation<long double>(1e-13)); + CALL_SUBTEST_9(test2dRotation<long double>(1e-13L)); CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14)); CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5)); - CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14)); + CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L)); CALL_SUBTEST_10(test3dRotation<double>(1e-13)); CALL_SUBTEST_11(test3dRotation<float>(1e-5)); - CALL_SUBTEST_12(test3dRotation<long double>(1e-13)); + CALL_SUBTEST_12(test3dRotation<long double>(1e-13L)); CALL_SUBTEST_2(testGeneral(Matrix2d(), 1e-13)); CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13)); @@ -171,10 +171,10 @@ void test_matrix_power() CALL_SUBTEST_5(testGeneral(Matrix3cf(), 1e-4)); CALL_SUBTEST_8(testGeneral(Matrix4f(), 1e-4)); CALL_SUBTEST_6(testGeneral(MatrixXf(2,2), 1e-3)); // see bug 614 - CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-13)); + CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-13L)); CALL_SUBTEST_10(testGeneral(Matrix3d(), 1e-13)); CALL_SUBTEST_11(testGeneral(Matrix3f(), 1e-4)); - CALL_SUBTEST_12(testGeneral(Matrix3e(), 1e-13)); + CALL_SUBTEST_12(testGeneral(Matrix3e(), 1e-13L)); CALL_SUBTEST_2(testSingular(Matrix2d(), 1e-13)); CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13)); @@ -184,10 +184,10 @@ void test_matrix_power() CALL_SUBTEST_5(testSingular(Matrix3cf(), 1e-4)); CALL_SUBTEST_8(testSingular(Matrix4f(), 1e-4)); CALL_SUBTEST_6(testSingular(MatrixXf(2,2), 1e-3)); - CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-13)); + CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-13L)); CALL_SUBTEST_10(testSingular(Matrix3d(), 1e-13)); CALL_SUBTEST_11(testSingular(Matrix3f(), 1e-4)); - CALL_SUBTEST_12(testSingular(Matrix3e(), 1e-13)); + CALL_SUBTEST_12(testSingular(Matrix3e(), 1e-13L)); CALL_SUBTEST_2(testLogThenExp(Matrix2d(), 1e-13)); CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13)); @@ -197,8 +197,8 @@ void test_matrix_power() CALL_SUBTEST_5(testLogThenExp(Matrix3cf(), 1e-4)); 
CALL_SUBTEST_8(testLogThenExp(Matrix4f(), 1e-4)); CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2), 1e-3)); - CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-13)); + CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-13L)); CALL_SUBTEST_10(testLogThenExp(Matrix3d(), 1e-13)); CALL_SUBTEST_11(testLogThenExp(Matrix3f(), 1e-4)); - CALL_SUBTEST_12(testLogThenExp(Matrix3e(), 1e-13)); + CALL_SUBTEST_12(testLogThenExp(Matrix3e(), 1e-13L)); } diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h index 9b0cf7268..8404f1ff8 100644 --- a/unsupported/test/mpreal/mpreal.h +++ b/unsupported/test/mpreal/mpreal.h @@ -99,7 +99,7 @@ // Detect support for explicit converters.
#if (__has_feature(cxx_explicit_conversions) || \
- (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR >= 5) || __cplusplus >= 201103L || \
+ (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
(defined(_MSC_VER) && _MSC_VER >= 1800))
#define MPREAL_HAVE_EXPLICIT_CONVERTERS
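
The `__GNUC_MINOR` → `__GNUC_MINOR__` fix above matters because an undefined identifier evaluates to 0 inside an `#if` expression, so the old spelling silently disabled the GCC branch of the feature test. As a hedged illustration only (this is not code from mpreal.h; the class and member names are invented), a guard like `MPREAL_HAVE_EXPLICIT_CONVERTERS` is typically consumed along these lines:

```cpp
// Hypothetical sketch: how an explicit-conversion guard is commonly used.
// Only the macro name comes from the diff above; everything else is made up.
#include <cstdio>

class Number {
  double value_;
 public:
  explicit Number(double v) : value_(v) {}
#ifdef MPREAL_HAVE_EXPLICIT_CONVERTERS
  // C++11 explicit conversion operator: the conversion must be requested
  // via static_cast, so there is no accidental implicit narrowing.
  explicit operator double() const { return value_; }
#else
  // Pre-C++11 fallback: a named accessor instead of a conversion operator.
  double toDouble() const { return value_; }
#endif
};

int main() {
  Number n(2.5);
#ifdef MPREAL_HAVE_EXPLICIT_CONVERTERS
  std::printf("%f\n", static_cast<double>(n));
#else
  std::printf("%f\n", n.toDouble());
#endif
  return 0;
}
```
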
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp index 1aa9e786a..ffa5691eb 100644 --- a/unsupported/test/mpreal_support.cpp +++ b/unsupported/test/mpreal_support.cpp @@ -17,6 +17,7 @@ void test_mpreal_support() std::cerr << "dummy_precision = " << NumTraits<mpreal>::dummy_precision() << "\n"; std::cerr << "highest = " << NumTraits<mpreal>::highest() << "\n"; std::cerr << "lowest = " << NumTraits<mpreal>::lowest() << "\n"; + std::cerr << "digits10 = " << NumTraits<mpreal>::digits10() << "\n"; for(int i = 0; i < g_repeat; i++) { int s = Eigen::internal::random<int>(1,100); diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp new file mode 100644 index 000000000..057fb3e92 --- /dev/null +++ b/unsupported/test/special_functions.cpp @@ -0,0 +1,345 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" +#include "../Eigen/SpecialFunctions" + +template<typename X, typename Y> +void verify_component_wise(const X& x, const Y& y) +{ + for(Index i=0; i<x.size(); ++i) + { + if((numext::isfinite)(y(i))) + VERIFY_IS_APPROX( x(i), y(i) ); + else if((numext::isnan)(y(i))) + VERIFY((numext::isnan)(x(i))); + else + VERIFY_IS_EQUAL( x(i), y(i) ); + } +} + +template<typename ArrayType> void array_special_functions() +{ + using std::abs; + using std::sqrt; + typedef typename ArrayType::Scalar Scalar; + typedef typename NumTraits<Scalar>::Real RealScalar; + + Scalar plusinf = std::numeric_limits<Scalar>::infinity(); + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + + Index rows = internal::random<Index>(1,30); + Index cols = 1; + + // API + { + ArrayType m1 = ArrayType::Random(rows,cols); +#if EIGEN_HAS_C99_MATH + VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1)); + VERIFY_IS_APPROX(m1.digamma(), digamma(m1)); + VERIFY_IS_APPROX(m1.erf(), erf(m1)); + VERIFY_IS_APPROX(m1.erfc(), erfc(m1)); +#endif // EIGEN_HAS_C99_MATH + } + + +#if EIGEN_HAS_C99_MATH + // check special functions (comparing against numpy implementation) + if (!NumTraits<Scalar>::IsComplex) + { + + { + ArrayType m1 = ArrayType::Random(rows,cols); + ArrayType m2 = ArrayType::Random(rows,cols); + + // Test various propreties of igamma & igammac. These are normalized + // gamma integrals where + // igammac(a, x) = Gamma(a, x) / Gamma(a) + // igamma(a, x) = gamma(a, x) / Gamma(a) + // where Gamma and gamma are considered the standard unnormalized + // upper and lower incomplete gamma functions, respectively. 
+ ArrayType a = m1.abs() + 2; + ArrayType x = m2.abs() + 2; + ArrayType zero = ArrayType::Zero(rows, cols); + ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0)); + ArrayType a_m1 = a - one; + ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp(); + ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp(); + ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp(); + ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp(); + + // Gamma(a, 0) == Gamma(a) + VERIFY_IS_APPROX(Eigen::igammac(a, zero), one); + + // Gamma(a, x) + gamma(a, x) == Gamma(a) + VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp()); + + // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x) + VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp()); + + // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x) + VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp()); + } + + { + // Check exact values of igamma and igammac against a third party calculation. + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + // location i*6+j corresponds to a_s[i], x_s[j]. + Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); + } + + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + } + } + } + } + } +#endif // EIGEN_HAS_C99_MATH + + // Check the zeta function against scipy.special.zeta + { + ArrayType x(7), q(7), res(7), ref(7); + x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9; + q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345; + ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); ); + } + + // digamma + { + ArrayType x(7), res(7), ref(7); + x << 1, 1.5, 4, -10.5, 10000.5, 0, -1; + ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + + CALL_SUBTEST( res = 
x.digamma(); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = digamma(x); verify_component_wise(res, ref); ); + } + + +#if EIGEN_HAS_C99_MATH + { + ArrayType n(11), x(11), res(11), ref(11); + n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170; + x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64; + ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + + if(sizeof(RealScalar)>=8) { // double + // Reason for commented line: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232 + // CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res, ref); ); + } + else { + // CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res.head(8), ref.head(8)); ); + CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res.head(8), ref.head(8)); ); + } + } +#endif + +#if EIGEN_HAS_C99_MATH + { + // Inputs and ground truth generated with scipy via: + // a = np.logspace(-3, 3, 5) - 1e-3 + // b = np.logspace(-3, 3, 5) - 1e-3 + // x = np.linspace(-0.1, 1.1, 5) + // (full_a, full_b, full_x) = np.vectorize(lambda a, b, x: (a, b, x))(*np.ix_(a, b, x)) + // full_a = full_a.flatten().tolist() # same for full_b, full_x + // v = scipy.special.betainc(full_a, full_b, full_x).flatten().tolist() + // + // Note in Eigen, we call betainc with arguments in the order (x, a, b). + ArrayType a(125); + ArrayType b(125); + ArrayType x(125); + ArrayType v(125); + ArrayType res(125); + + a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999; + + b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, + 999.999, 999.999, 999.999, 999.999, 0.0, 
0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, + 0.999, 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999; + + x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, + -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1; + + v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, + 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, + 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, + 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, + nan, nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, + 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, + 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, + 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, + 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, + 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, + nan, nan, 7.864342668429763e-23, 3.015969667594166e-10, + 0.0008598571564165444, nan, nan, 6.031987710123844e-08, + 0.5000000000000007, 0.9999999396801229, nan, nan, 0.9999999999999999, + 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, + nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, + 0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan, nan, 0.0, + 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan, + 2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan; + + CALL_SUBTEST(res = betainc(a, b, x); + verify_component_wise(res, v);); + } + + // Test various properties of betainc + { + ArrayType m1 = ArrayType::Random(32); + ArrayType m2 = ArrayType::Random(32); + ArrayType m3 = 
ArrayType::Random(32); + ArrayType one = ArrayType::Constant(32, Scalar(1.0)); + const Scalar eps = std::numeric_limits<Scalar>::epsilon(); + ArrayType a = (m1 * 4.0).exp(); + ArrayType b = (m2 * 4.0).exp(); + ArrayType x = m3.abs(); + + // betainc(a, 1, x) == x**a + CALL_SUBTEST( + ArrayType test = betainc(a, one, x); + ArrayType expected = x.pow(a); + verify_component_wise(test, expected);); + + // betainc(1, b, x) == 1 - (1 - x)**b + CALL_SUBTEST( + ArrayType test = betainc(one, b, x); + ArrayType expected = one - (one - x).pow(b); + verify_component_wise(test, expected);); + + // betainc(a, b, x) == 1 - betainc(b, a, 1-x) + CALL_SUBTEST( + ArrayType test = betainc(a, b, x) + betainc(b, a, one - x); + ArrayType expected = one; + verify_component_wise(test, expected);); + + // betainc(a+1, b, x) = betainc(a, b, x) - x**a * (1 - x)**b / (a * beta(a, b)) + CALL_SUBTEST( + ArrayType num = x.pow(a) * (one - x).pow(b); + ArrayType denom = a * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp(); + // Add eps to rhs and lhs so that component-wise test doesn't result in + // nans when both outputs are zeros. + ArrayType expected = betainc(a, b, x) - num / denom + eps; + ArrayType test = betainc(a + one, b, x) + eps; + if (sizeof(Scalar) >= 8) { // double + verify_component_wise(test, expected); + } else { + // Reason for limited test: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232 + verify_component_wise(test.head(8), expected.head(8)); + }); + + // betainc(a, b+1, x) = betainc(a, b, x) + x**a * (1 - x)**b / (b * beta(a, b)) + CALL_SUBTEST( + // Add eps to rhs and lhs so that component-wise test doesn't result in + // nans when both outputs are zeros. + ArrayType num = x.pow(a) * (one - x).pow(b); + ArrayType denom = b * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp(); + ArrayType expected = betainc(a, b, x) + num / denom + eps; + ArrayType test = betainc(a, b + one, x) + eps; + verify_component_wise(test, expected);); + } +#endif +} + +void test_special_functions() +{ + CALL_SUBTEST_1(array_special_functions<ArrayXf>()); + CALL_SUBTEST_2(array_special_functions<ArrayXd>()); +} |
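
As a reading aid (not part of the diff), the identities asserted by the new special_functions test are restated below from the test's own comments. Here P and Q denote the normalized lower and upper incomplete gamma integrals computed by `igamma` and `igammac`, and `betainc(a, b, x)` is written as the regularized incomplete beta function I_x(a, b):

```latex
% Normalized incomplete gamma functions:
%   igamma(a, x)  = P(a, x) = gamma(a, x) / Gamma(a)
%   igammac(a, x) = Q(a, x) = Gamma(a, x) / Gamma(a)
\begin{align}
  Q(a, 0) &= 1, \qquad P(a, x) + Q(a, x) = 1 \\
  \Gamma(a, x) &= (a - 1)\,\Gamma(a - 1, x) + x^{a-1} e^{-x} \\
  \gamma(a, x) &= (a - 1)\,\gamma(a - 1, x) - x^{a-1} e^{-x}
\end{align}
% Regularized incomplete beta function I_x(a, b) = betainc(a, b, x):
\begin{align}
  I_x(a, 1) &= x^{a}, \qquad I_x(1, b) = 1 - (1 - x)^{b} \\
  I_x(a, b) &= 1 - I_{1-x}(b, a) \\
  I_x(a + 1, b) &= I_x(a, b) - \frac{x^{a} (1 - x)^{b}}{a\,B(a, b)} \\
  I_x(a, b + 1) &= I_x(a, b) + \frac{x^{a} (1 - x)^{b}}{b\,B(a, b)}
\end{align}
```

The test checks each relation component-wise against `Eigen::igamma`, `Eigen::igammac`, and `betainc`, falling back to a reduced number of samples in single precision (see the bug 1232 reference in the test).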