Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CMakeLists.txt | 4
-rw-r--r--  unsupported/Eigen/CXX11/CMakeLists.txt | 2
-rw-r--r--  unsupported/Eigen/CXX11/Tensor | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/CMakeLists.txt | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/README.md | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 124
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 74
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 24
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 37
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 28
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 99
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 80
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 86
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h | 33
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 69
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 50
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 37
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 46
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 145
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 172
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h | 5
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 44
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 37
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h | 7
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/KroneckerProduct | 2
-rw-r--r--  unsupported/Eigen/MPRealSupport | 29
-rw-r--r--  unsupported/Eigen/SpecialFunctions | 61
-rwxr-xr-x  unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 177
-rw-r--r--  unsupported/Eigen/src/AutoDiff/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/BVH/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/CMakeLists.txt | 16
-rw-r--r--  unsupported/Eigen/src/Eigenvalues/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/FFT/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h | 4
-rw-r--r--  unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h | 33
-rw-r--r--  unsupported/Eigen/src/MoreVectorization/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/NumericalDiff/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/Polynomials/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/Skyline/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/SparseExtra/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h | 124
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h | 236
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h | 47
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 1551
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h | 58
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h | 165
-rw-r--r--  unsupported/Eigen/src/Splines/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/Splines/Spline.h | 2
-rw-r--r--  unsupported/doc/examples/BVH_Example.cpp | 4
-rw-r--r--  unsupported/test/CMakeLists.txt | 21
-rw-r--r--  unsupported/test/FFTW.cpp | 32
-rw-r--r--  unsupported/test/autodiff.cpp | 32
-rw-r--r--  unsupported/test/autodiff_scalar.cpp | 35
-rw-r--r--  unsupported/test/cxx11_float16.cpp | 203
-rw-r--r--  unsupported/test/cxx11_non_blocking_thread_pool.cpp | 5
-rw-r--r--  unsupported/test/cxx11_tensor_cuda.cu | 150
-rw-r--r--  unsupported/test/cxx11_tensor_dimension.cpp | 16
-rw-r--r--  unsupported/test/cxx11_tensor_io.cpp | 22
-rw-r--r--  unsupported/test/cxx11_tensor_morphing.cpp | 48
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_cuda.cu | 141
-rw-r--r--  unsupported/test/cxx11_tensor_reduction.cpp | 29
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_cuda.cu | 26
-rw-r--r--  unsupported/test/cxx11_tensor_scan.cpp | 94
-rw-r--r--  unsupported/test/cxx11_tensor_scan_cuda.cu | 77
-rw-r--r--  unsupported/test/cxx11_tensor_sugar.cpp | 34
-rw-r--r--  unsupported/test/kronecker_product.cpp | 22
-rw-r--r--  unsupported/test/mpreal_support.cpp | 1
-rw-r--r--  unsupported/test/special_functions.cpp | 345
89 files changed, 4328 insertions(+), 898 deletions(-)
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index 2fc8db412..631a06014 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -18,6 +18,7 @@ set(Eigen_HEADERS
Polynomials
Skyline
SparseExtra
+ SpecialFunctions
Splines
)
@@ -26,5 +27,6 @@ install(FILES
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
)
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
add_subdirectory(CXX11)
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index a40bc4715..385ed240c 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -5,4 +5,4 @@ install(FILES
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
)
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 859147404..f7b94cee1 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -15,6 +15,7 @@
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../SpecialFunctions"
#include "src/util/CXX11Meta.h"
#include "src/util/MaxSizeVector.h"
@@ -80,6 +81,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorTraits.h"
#include "src/Tensor/TensorUInt128.h"
#include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
#include "src/Tensor/TensorBase.h"
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
deleted file mode 100644
index 1734262bb..000000000
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(util)
-add_subdirectory(ThreadPool)
-add_subdirectory(Tensor)
-add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_Tensor_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index fda33edda..02146527b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1102,7 +1102,7 @@ Example: Reduction along two dimensions.
As a special case, if you pass no parameter to a reduction operation the
original tensor is reduced along *all* its dimensions. The result is a
-one-dimension tensor with a single value.
+scalar, represented as a zero-dimension tensor.
Eigen::Tensor<float, 3> a(2, 3, 4);
a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
@@ -1112,7 +1112,7 @@ one-dimension tensor with a single value.
{19.0f, 18.0f, 17.0f, 16.0f},
{20.0f, 21.0f, 22.0f, 23.0f}}});
// Reduce along all dimensions using the sum() operator.
- Eigen::Tensor<float, 1> b = a.sum();
+ Eigen::Tensor<float, 0> b = a.sum();
cout << "b" << endl << b << endl << endl;
=>
b
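Note: a zero-dimension tensor behaves like a scalar; a minimal sketch of reading the reduced value back (assuming the rank-0 operator() accessor):

    Eigen::Tensor<float, 0> b = a.sum();
    float total = b();  // rank-0 tensors are indexed with no arguments
    cout << "total = " << total << endl;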
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 1eaa8d4fc..7a45a5cf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+ log1p() const {
+ return unaryExpr(internal::scalar_log1p_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
abs() const {
return unaryExpr(internal::scalar_abs_op<Scalar>());
@@ -204,34 +210,74 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
pow(Scalar exponent) const {
- return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+ real() const {
+ return unaryExpr(internal::scalar_real_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+ imag() const {
+ return unaryExpr(internal::scalar_imag_op<Scalar>());
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
operator+ (Scalar rhs) const {
- return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+ operator+ (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
operator- (Scalar rhs) const {
EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+ operator- (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
operator* (Scalar rhs) const {
- return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+ operator* (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
operator/ (Scalar rhs) const {
- return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+ operator/ (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
}
EIGEN_DEVICE_FUNC
@@ -277,7 +323,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
return unaryExpr(internal::scalar_floor_op<Scalar>());
}
-
// Generic binary operation support.
template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
@@ -342,66 +387,66 @@ class TensorBase<Derived, ReadOnlyAccessors>
// Comparisons and tests.
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
operator<(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
operator<=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
operator>(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
operator>=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
operator==(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
operator!=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
}
// comparisons and tests for Scalars
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator<(Scalar threshold) const {
return operator<(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator<=(Scalar threshold) const {
return operator<=(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator>(Scalar threshold) const {
return operator>(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator>=(Scalar threshold) const {
return operator>=(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator==(Scalar threshold) const {
return operator==(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator!=(Scalar threshold) const {
return operator!=(constant(threshold));
}
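For context, a hedged sketch of how these scalar comparison overloads compose with select() (values illustrative, not from the patch):

    Eigen::Tensor<float, 1> t(3);
    t.setValues({0.2f, 0.5f, 0.9f});
    // (t > 0.4f) is a boolean tensor expression; select() picks per element.
    Eigen::Tensor<float, 1> clipped =
        (t > 0.4f).select(t, t.constant(0.0f));  // yields 0.0, 0.5, 0.9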
@@ -457,15 +502,22 @@ class TensorBase<Derived, ReadOnlyAccessors>
typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorScanSumOp
- cumsum(const Index& axis) const {
- return TensorScanSumOp(derived(), axis);
+ cumsum(const Index& axis, bool exclusive = false) const {
+ return TensorScanSumOp(derived(), axis, exclusive);
}
typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorScanProdOp
- cumprod(const Index& axis) const {
- return TensorScanProdOp(derived(), axis);
+ cumprod(const Index& axis, bool exclusive = false) const {
+ return TensorScanProdOp(derived(), axis, exclusive);
+ }
+
+ template <typename Reducer>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanOp<Reducer, const Derived>
+ scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+ return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
}
// Reductions.
@@ -771,8 +823,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
};
-template<typename Derived>
-class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
public:
typedef internal::traits<Derived> DerivedTraits;
typedef typename DerivedTraits::Scalar Scalar;
@@ -782,7 +834,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
- template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+ template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setZero() {
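A short usage sketch of the extended scan API added above (the exclusive flag and reducer parameter are as declared in this hunk; concrete values are illustrative):

    Eigen::Tensor<int, 1> t(4);
    t.setValues({1, 2, 3, 4});
    Eigen::Tensor<int, 1> inc = t.cumsum(0);                      // 1, 3, 6, 10
    Eigen::Tensor<int, 1> exc = t.cumsum(0, /*exclusive=*/true);  // 0, 1, 3, 6
    // Generic form with an explicit reducer; equivalent to cumsum(0) here.
    Eigen::Tensor<int, 1> gen = t.scan(0, Eigen::internal::SumReducer<int>());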
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 56d9c2025..a6001074b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,8 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
- typename RhsXprType::Scalar>::ret Scalar;
+ typedef typename gebp_traits<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ResScalar Scalar;
+
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -75,8 +75,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
{
public:
typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
- typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
- typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 886474986..d65dbb40f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -461,8 +461,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
#undef writeResultShmem
#undef writeRow
- const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
- const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+ const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+ const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
if (threadIdx.x < max_i_write) {
if (max_j_write == 8) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index b27e1a1b4..9b2cb3ff6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -130,19 +130,19 @@ class SimpleTensorContractionMapper {
}
Index contract_val = left ? col : row;
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx = contract_val / m_k_strides[i];
- linidx += idx * m_contract_strides[i];
- contract_val -= idx * m_k_strides[i];
- }
-
if(array_size<contract_t>::value > 0) {
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx += contract_val;
- } else {
- linidx += contract_val * m_contract_strides[0];
- }
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx = contract_val / m_k_strides[i];
+ linidx += idx * m_contract_strides[i];
+ contract_val -= idx * m_k_strides[i];
+ }
+
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx += contract_val;
+ } else {
+ linidx += contract_val * m_contract_strides[0];
+ }
}
return linidx;
@@ -153,15 +153,15 @@ class SimpleTensorContractionMapper {
const bool left = (side == Lhs);
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
- for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = nocontract_val[0] / m_ij_strides[i];
- const Index idx1 = nocontract_val[1] / m_ij_strides[i];
- linidx[0] += idx0 * m_nocontract_strides[i];
- linidx[1] += idx1 * m_nocontract_strides[i];
- nocontract_val[0] -= idx0 * m_ij_strides[i];
- nocontract_val[1] -= idx1 * m_ij_strides[i];
- }
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+ const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+ linidx[0] += idx0 * m_nocontract_strides[i];
+ linidx[1] += idx1 * m_nocontract_strides[i];
+ nocontract_val[0] -= idx0 * m_ij_strides[i];
+ nocontract_val[1] -= idx1 * m_ij_strides[i];
+ }
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx[0] += nocontract_val[0];
@@ -173,22 +173,24 @@ class SimpleTensorContractionMapper {
}
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = contract_val[0] / m_k_strides[i];
- const Index idx1 = contract_val[1] / m_k_strides[i];
- linidx[0] += idx0 * m_contract_strides[i];
- linidx[1] += idx1 * m_contract_strides[i];
- contract_val[0] -= idx0 * m_k_strides[i];
- contract_val[1] -= idx1 * m_k_strides[i];
- }
+ if (array_size<contract_t>::value > 0) {
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = contract_val[0] / m_k_strides[i];
+ const Index idx1 = contract_val[1] / m_k_strides[i];
+ linidx[0] += idx0 * m_contract_strides[i];
+ linidx[1] += idx1 * m_contract_strides[i];
+ contract_val[0] -= idx0 * m_k_strides[i];
+ contract_val[1] -= idx1 * m_k_strides[i];
+ }
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx[0] += contract_val[0];
- linidx[1] += contract_val[1];
- } else {
- linidx[0] += contract_val[0] * m_contract_strides[0];
- linidx[1] += contract_val[1] * m_contract_strides[0];
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx[0] += contract_val[0];
+ linidx[1] += contract_val[1];
+ } else {
+ linidx[0] += contract_val[0] * m_contract_strides[0];
+ linidx[1] += contract_val[1] * m_contract_strides[0];
+ }
}
return IndexPair<Index>(linidx[0], linidx[1]);
}
@@ -200,7 +202,7 @@ class SimpleTensorContractionMapper {
return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
- return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
+ return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
}
protected:
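The loops guarded above perform a mixed-radix decomposition of a flattened contraction index. A standalone sketch of the same arithmetic, with hypothetical names and a fixed rank of 3 for clarity:

    // Peel a linear index 'val' apart stride by stride (outermost first),
    // accumulating the storage offset via the matching output strides.
    Eigen::Index decompose(Eigen::Index val,
                           const Eigen::array<Eigen::Index, 3>& k_strides,
                           const Eigen::array<Eigen::Index, 3>& out_strides) {
      Eigen::Index linidx = 0;
      for (int i = 2; i > 0; i--) {
        const Eigen::Index idx = val / k_strides[i];
        linidx += idx * out_strides[i];
        val -= idx * k_strides[i];
      }
      return linidx + val * out_strides[0];  // innermost dimension
    }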
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index a60a17049..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// across k dimension.
const TensorOpCost cost =
contractionCost(m, n, bm, bn, bk, shard_by_col, false);
- Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
static_cast<double>(n) * m, cost, this->m_device.numThreads());
// TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
class Context {
public:
Context(const Device& device, int num_threads, LhsMapper& lhs,
- RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+ RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
Index gn, Index nm0, Index nn0, bool shard_by_col,
bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
lhs_(lhs),
rhs_(rhs),
buffer_(buffer),
- output_(buffer, m),
+ output_(buffer, tm),
num_threads_(num_threads),
shard_by_col_(shard_by_col),
parallel_pack_(parallel_pack),
- m_(m),
- n_(n),
- k_(k),
+ m_(tm),
+ n_(tn),
+ k_(tk),
bm_(bm),
bn_(bn),
bk_(bk),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index a76c8ca35..d66e45d50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -91,21 +91,21 @@ class TensorOpCost {
}
// TODO(rmlarsen): Define min in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
- const TensorOpCost& rhs) {
- bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
- bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
- compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
- return *this;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
// TODO(rmlarsen): Define max in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
- const TensorOpCost& rhs) {
- bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
- bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
- compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
- return *this;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
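With the change above, cwiseMin/cwiseMax are const and return a new cost instead of mutating the receiver; a hedged sketch of the resulting call pattern (cost values illustrative):

    TensorOpCost a(/*bytes_loaded=*/8, /*bytes_stored=*/4, /*compute_cycles=*/10);
    TensorOpCost b(16, 2, 5);
    TensorOpCost lo = a.cwiseMin(b);  // (8, 2, 5); 'a' and 'b' are untouched
    TensorOpCost hi = a.cwiseMax(b);  // (16, 4, 10)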
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 6c12b2ed8..1468caa23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -12,6 +12,8 @@
namespace Eigen {
+static const int kCudaScratchSize = 1024;
+
// This defines an interface that GPUDevice can take to use
// CUDA streams underneath.
class StreamInterface {
@@ -27,6 +29,12 @@ class StreamInterface {
// Return a scratchpad buffer of size 1k
virtual void* scratchpad() const = 0;
+
+ // Return a semaphore. The semaphore is initialized to 0, and
+ // each kernel using it is responsible for resetting to 0 upon completion
+ // to maintain the invariant that the semaphore is always equal to 0 upon
+ // each kernel start.
+ virtual unsigned int* semaphore() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
@@ -65,12 +73,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
- CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
+ CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
cudaGetDevice(&device_);
initializeDeviceProp();
}
// Use the default stream on the specified device
- CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
+ CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
initializeDeviceProp();
}
// Use the specified stream. Note that it's the
@@ -78,7 +86,7 @@ class CudaStreamDevice : public StreamInterface {
// the specified device. If no device is specified the code
// assumes that the stream is associated to the current gpu device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
- : stream_(stream), device_(device), scratch_(NULL) {
+ : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) {
cudaGetDevice(&device_);
} else {
@@ -123,15 +131,27 @@ class CudaStreamDevice : public StreamInterface {
virtual void* scratchpad() const {
if (scratch_ == NULL) {
- scratch_ = allocate(1024);
+ scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
}
return scratch_;
}
+ virtual unsigned int* semaphore() const {
+ if (semaphore_ == NULL) {
+ char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+ semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+ cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == cudaSuccess);
+ }
+ return semaphore_;
+ }
+
private:
const cudaStream_t* stream_;
int device_;
mutable void* scratch_;
+ mutable unsigned int* semaphore_;
};
struct GpuDevice {
@@ -174,6 +194,15 @@ struct GpuDevice {
#endif
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+#ifndef __CUDA_ARCH__
+ return stream_->semaphore();
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return NULL;
+#endif
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
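A hypothetical kernel illustrating the semaphore contract described above (not part of this patch): the counter starts at 0, and the last block to arrive performs the final step and resets it, so the next launch again observes 0.

    __global__ void FinalizeWithSemaphore(float* out, unsigned int* semaphore,
                                          unsigned int num_blocks) {
      // ... each block computes and stores its partial result ...
      __syncthreads();
      if (threadIdx.x == 0) {
        if (atomicAdd(semaphore, 1u) == num_blocks - 1) {
          // Last block: finalize 'out' here, then restore the invariant.
          *semaphore = 0;
        }
      }
    }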
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index d31b0ad38..069680a11 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
// Build a thread pool device on top of an existing pool of threads.
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
- ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+ ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
::memset(buffer, c, n);
}
- EIGEN_STRONG_INLINE size_t numThreads() const {
+ EIGEN_STRONG_INLINE int numThreads() const {
return num_threads_;
}
@@ -151,9 +151,7 @@ struct ThreadPoolDevice {
template <class Function, class... Args>
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
Notification* n = new Notification();
- std::function<void()> func =
- std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
return n;
}
@@ -161,15 +159,19 @@ struct ThreadPoolDevice {
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
Function&& f,
Args&&... args) const {
- std::function<void()> func = std::bind(
- &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(
+ &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
- std::function<void()> func = std::bind(f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(f, args...));
+ }
+
+ // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+ // called from one of the threads in pool_. Returns -1 otherwise.
+ EIGEN_STRONG_INLINE int currentThreadId() const {
+ return pool_->CurrentThreadId();
}
// parallelFor executes f with [0, n) arguments in parallel and waits for
@@ -182,7 +184,7 @@ struct ThreadPoolDevice {
std::function<void(Index, Index)> f) const {
typedef TensorCostModel<ThreadPoolDevice> CostModel;
if (n <= 1 || numThreads() == 1 ||
- CostModel::numThreads(n, cost, numThreads()) == 1) {
+ CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
f(0, n);
return;
}
@@ -242,7 +244,7 @@ struct ThreadPoolDevice {
// Recursively divide size into halves until we reach block_size.
// Division code rounds mid to block_size, so we are guaranteed to get
// block_count leaves that do actual computations.
- Barrier barrier(block_count);
+ Barrier barrier(static_cast<unsigned int>(block_count));
std::function<void(Index, Index)> handleRange;
handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
if (last - first <= block_size) {
@@ -268,7 +270,7 @@ struct ThreadPoolDevice {
private:
ThreadPoolInterface* pool_;
- size_t num_threads_;
+ int num_threads_;
};
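The hunk above cuts off inside handleRange; for context, a condensed sketch of the full recursive strategy the comments describe (the midpoint rounding and the divup helper are assumptions based on this file, not quoted from the patch):

    Barrier barrier(static_cast<unsigned int>(block_count));
    std::function<void(Index, Index)> handleRange;
    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
      if (last - first <= block_size) {
        f(first, last);    // leaf: run the user functor on this range
        barrier.Notify();  // one notification per leaf, block_count in total
        return;
      }
      // Split at a multiple of block_size so each leaf spans one block.
      Index mid = first + divup((last - first) / 2, block_size) * block_size;
      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
      handleRange(first, mid);
    };
    handleRange(0, n);
    barrier.Wait();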
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 26b1f65a8..a08dfa7c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = true,
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 31b361c83..33ffaa600 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -131,7 +131,7 @@ double loadConstant(const double* address) {
}
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
Eigen::half loadConstant(const Eigen::half* address) {
- return Eigen::half(internal::raw_uint16_to_half(__ldg(&address->x)));
+ return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
}
#endif
}
@@ -403,6 +403,101 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorEvaluator<RightArgType, Device> m_rightImpl;
};
+// -------------------- CwiseTernaryOp --------------------
+
+template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
+{
+ typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
+ internal::functor_traits<TernaryOp>::PacketAccess,
+ Layout = TensorEvaluator<Arg1Type, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+ : m_functor(op.functor()),
+ m_arg1Impl(op.arg1Expression(), device),
+ m_arg2Impl(op.arg2Expression(), device),
+ m_arg3Impl(op.arg3Expression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg2Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg3Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg2Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg3Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+
+ eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+ return m_arg1Impl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_arg1Impl.evalSubExprsIfNeeded(NULL);
+ m_arg2Impl.evalSubExprsIfNeeded(NULL);
+ m_arg3Impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_arg1Impl.cleanup();
+ m_arg2Impl.cleanup();
+ m_arg3Impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
+ m_arg2Impl.template packet<LoadMode>(index),
+ m_arg3Impl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+ return m_arg1Impl.costPerCoeff(vectorized) +
+ m_arg2Impl.costPerCoeff(vectorized) +
+ m_arg3Impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+ const TernaryOp m_functor;
+ TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+ TensorEvaluator<Arg2Type, Device> m_arg2Impl;
+ TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
// -------------------- SelectOp --------------------
@@ -479,7 +574,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
private:
TensorEvaluator<IfArgType, Device> m_condImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index ad5c97b57..a116bf17f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -159,7 +159,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
#else
size_t num_threads = device.numThreads();
if (num_threads > 1) {
- cost = evaluator.costPerCoeff(Vectorizable)
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index ea250d8bc..5f2e329f2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -219,6 +219,86 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsX
namespace internal {
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
+{
+ // Type promotion to handle the case where the types of the args are different.
+ typedef typename result_of<
+ TernaryOp(typename Arg1XprType::Scalar,
+ typename Arg2XprType::Scalar,
+ typename Arg3XprType::Scalar)>::type Scalar;
+ typedef traits<Arg1XprType> XprTraits;
+ typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+ typedef typename traits<Arg1XprType>::Index Index;
+ typedef typename Arg1XprType::Nested Arg1Nested;
+ typedef typename Arg2XprType::Nested Arg2Nested;
+ typedef typename Arg3XprType::Nested Arg3Nested;
+ typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+ typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+ typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+ typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+ typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef Scalar CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+ : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const TernaryOp& functor() const { return m_functor; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+ arg1Expression() const { return m_arg1_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+ arg2Expression() const { return m_arg2_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+ arg3Expression() const { return m_arg3_xpr; }
+
+ protected:
+ typename Arg1XprType::Nested m_arg1_xpr;
+ typename Arg2XprType::Nested m_arg2_xpr;
+ typename Arg3XprType::Nested m_arg3_xpr;
+ const TernaryOp m_functor;
+};
+
+
+namespace internal {
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
: traits<ThenXprType>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index ece2ed91b..08eb5595a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -329,7 +329,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
- a[i] = data[i] * std::conj(pos_j_base_powered[i]);
+ a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
}
else {
a[i] = data[i] * pos_j_base_powered[i];
@@ -344,7 +344,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
b[i] = pos_j_base_powered[i];
}
else {
- b[i] = std::conj(pos_j_base_powered[i]);
+ b[i] = numext::conj(pos_j_base_powered[i]);
}
}
for (Index i = n; i < m - n; ++i) {
@@ -355,7 +355,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
b[i] = pos_j_base_powered[m-i];
}
else {
- b[i] = std::conj(pos_j_base_powered[m-i]);
+ b[i] = numext::conj(pos_j_base_powered[m-i]);
}
}
@@ -379,7 +379,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
- data[i] = a[i] * std::conj(pos_j_base_powered[i]);
+ data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
}
else {
data[i] = a[i] * pos_j_base_powered[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 5d0548b84..c23ecdbc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
- const Index numValues = m_impl.dimensions().TotalSize();
+ const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.
if (NumTraits<CoeffReturnType>::RequireInitialization) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index a1a18d938..490ddd8bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -16,11 +16,12 @@ template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType
template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
template<typename PlainObjectType> class TensorRef;
-template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
+template<typename Derived, int AccessLevel> class TensorBase;
template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
template<typename XprType> class TensorIndexTupleOp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3dd32e9d1..a8e48fced 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
};
+template<typename Reducer, typename Device>
+struct reducer_traits {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
// Standard reduction functors
template <typename T> struct SumReducer
{
@@ -119,6 +127,15 @@ template <typename T> struct SumReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd
+ };
+};
+
+
template <typename T> struct MeanReducer
{
static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
DenseIndex packetCount_;
};
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd
+ };
+};
+
+
template <typename T> struct MaxReducer
{
static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMax
+ };
+};
+
+
template <typename T> struct MinReducer
{
static const bool PacketAccess = packet_traits<T>::HasMin;
@@ -228,6 +263,14 @@ template <typename T> struct MinReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMin
+ };
+};
+
template <typename T> struct ProdReducer
{
@@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::MulCost,
+ PacketAccess = PacketType<T, Device>::HasMul
+ };
+};
+
struct AndReducer
{
@@ -280,6 +331,15 @@ struct AndReducer
}
};
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
struct OrReducer {
static const bool PacketAccess = false;
static const bool IsStateful = false;
@@ -295,6 +355,15 @@ struct OrReducer {
}
};
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
// Argmin/Argmax reducers
template <typename T> struct ArgMaxTupleReducer
{
@@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
+ };
+};
+
+
template <typename T> struct ArgMinTupleReducer
{
static const bool PacketAccess = false;
@@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
+ };
+};
+
// Random number generation
namespace {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 000000000..665b861cf
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) applied to the given tensors.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+ TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+ const ADerived, const BDerived, const XDerived>
+ betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+ return TensorCwiseTernaryOp<
+ internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+ const BDerived, const XDerived>(
+ a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
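A usage sketch for the new global function, assuming tensors of matching shape (all names below are illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>

    Eigen::Tensor<float, 2> betainc_example() {
      Eigen::Tensor<float, 2> a(3, 4), b(3, 4), x(3, 4);
      a.setConstant(0.5f); b.setConstant(2.0f); x.setConstant(0.25f);
      // Coefficient-wise regularized incomplete beta function I_x(a, b).
      return Eigen::betainc(a, b, x);
    }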
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index 38a833f82..a901c5dd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -13,38 +13,61 @@
namespace Eigen {
namespace internal {
-template<>
-struct significant_decimals_impl<std::string>
- : significant_decimals_default_impl<std::string, true>
-{};
-}
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+ typedef typename Tensor::Index Index;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+ if (total_size > 0) {
+ const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+ static const int layout = Tensor::Layout;
+ Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+ os << matrix;
+ }
+ }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+ typedef typename Tensor::Index Index;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+ if (total_size > 0) {
+ Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+ os << array;
+ }
+ }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ os << tensor.coeff(0);
+ }
+};
+}
template <typename T>
std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+ typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+ typedef typename Evaluator::Dimensions Dimensions;
+
// Evaluate the expression if needed
TensorForcedEvalOp<const T> eval = expr.eval();
- TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+ Evaluator tensor(eval, DefaultDevice());
tensor.evalSubExprsIfNeeded(NULL);
- typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
- typedef typename T::Index Index;
- typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
- const Index total_size = internal::array_prod(tensor.dimensions());
-
- // Print the tensor as a 1d vector or a 2d matrix.
+ // Print the result
static const int rank = internal::array_size<Dimensions>::value;
- if (rank == 0) {
- os << tensor.coeff(0);
- } else if (rank == 1) {
- Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
- os << array;
- } else {
- const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
- static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
- Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
- os << matrix;
- }
+ internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
// Cleanup.
tensor.cleanup();
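The printing behaviour is unchanged from the caller's perspective: rank 0 prints the scalar, rank 1 prints a column, and higher ranks flatten to a first_dim x (total_size/first_dim) matrix. A minimal sketch:

    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    void print_example() {
      Eigen::Tensor<float, 3> t(2, 3, 4);
      t.setRandom();
      // Rank-3 tensor: printed as a 2 x 12 matrix (first dim x rest).
      std::cout << t << std::endl;
    }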
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 33c6c1b0f..ede3939c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -29,25 +29,47 @@ namespace Eigen {
namespace internal {
namespace {
+
// Note: result is undefined if val == 0
template <typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val)
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{
#ifdef __CUDA_ARCH__
- return (sizeof(T) == 8) ? __clzll(val) : __clz(val);
+ return __clz(val);
#elif EIGEN_COMP_MSVC
- unsigned long index;
- if (sizeof(T) == 8) {
- _BitScanReverse64(&index, val);
- } else {
- _BitScanReverse(&index, val);
- }
- return (sizeof(T) == 8) ? 63 - index : 31 - index;
+ unsigned long index;
+ _BitScanReverse(&index, val);
+ return 31 - index;
+#else
+ EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return __builtin_clz(static_cast<uint32_t>(val));
+#endif
+ }
+
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
+ {
+#ifdef __CUDA_ARCH__
+ return __clzll(val);
+#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
+ unsigned long index;
+ _BitScanReverse64(&index, val);
+ return 63 - index;
+#elif EIGEN_COMP_MSVC
+ // MSVC's _BitScanReverse64 is not available for 32-bit builds.
+ unsigned int lo = (unsigned int)(val&0xffffffff);
+ unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
+ int n;
+ if(hi==0)
+ n = 32 + count_leading_zeros<unsigned int>(lo);
+ else
+ n = count_leading_zeros<unsigned int>(hi);
+ return n;
#else
EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return (sizeof(T) == 8) ?
- __builtin_clzll(static_cast<uint64_t>(val)) :
- __builtin_clz(static_cast<uint32_t>(val));
+ return __builtin_clzll(static_cast<uint64_t>(val));
#endif
}
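Splitting count_leading_zeros by sizeof(T) via enable_if means each overload only ever instantiates intrinsics that exist for that width, instead of branching on sizeof at runtime. The same SFINAE pattern in isolation (bits_example is hypothetical):

    #include <cstdint>
    #include <type_traits>

    // Only one of these overloads participates in overload resolution
    // for a given T, mirroring the count_leading_zeros split above.
    template <typename T>
    typename std::enable_if<sizeof(T) == 4, int>::type bits_example(T) { return 32; }

    template <typename T>
    typename std::enable_if<sizeof(T) == 8, int>::type bits_example(T) { return 64; }

    // bits_example(uint32_t(1)) == 32; bits_example(uint64_t(1)) == 64.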
@@ -98,7 +120,9 @@ namespace {
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
- TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+ TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
+ - TensorUInt128<static_val<1>, static_val<0> >(1, 0)
+ + TensorUInt128<static_val<0>, static_val<1> >(1);
return static_cast<uint64_t>(result);
#endif
}
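The multiplier computed here supports division by a runtime-constant divisor as a multiply plus shift. A hedged sketch of the underlying trick for 32-bit dividends, assuming a compiler with __uint128_t as in the branch above (not the library's exact code path, which folds the multiplier into a narrower representation):

    #include <cstdint>

    // Divide by a fixed divisor d via a precomputed magic multiplier
    // (Granlund & Montgomery style): m = ceil(2^(32+s)/d), s = ceil(log2(d)),
    // then x / d == (x * m) >> (32 + s) for all 32-bit x.
    struct MagicDiv32 {
      uint64_t multiplier;
      int shift;
      explicit MagicDiv32(uint32_t d) : shift(0) {
        while ((uint64_t(1) << shift) < d) ++shift;               // ceil(log2(d))
        multiplier = uint64_t(((__uint128_t(1) << (32 + shift)) + d - 1) / d);
      }
      uint32_t divide(uint32_t x) const {
        return uint32_t((__uint128_t(x) * multiplier) >> (32 + shift));
      }
    };
    // MagicDiv32(7).divide(42) == 6, with no hardware divide per query.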
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b1645d56f..fdb5ee6b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -47,22 +47,39 @@ template <> struct max_n_1<0> {
// Default packet types
template <typename Scalar, typename Device>
-struct PacketType {
+struct PacketType : internal::packet_traits<Scalar> {
typedef typename internal::packet_traits<Scalar>::type type;
- enum { size = internal::unpacket_traits<type>::size };
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
template <>
-struct PacketType<float, GpuDevice> {
- typedef float4 type;
- static const int size = 4;
-};
-template <>
-struct PacketType<double, GpuDevice> {
- typedef double2 type;
+struct PacketType<half, GpuDevice> {
+ typedef half2 type;
static const int size = 2;
+ enum {
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasBlend = 0,
+
+ HasDiv = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasExp = 1,
+ HasLog = 1,
+ HasLog1p = 0,
+ HasLog10 = 0,
+ HasPow = 1,
+ };
};
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 52cfc2824..d34f1e328 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
protected:
TensorEvaluator<ArgType, Device> m_impl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 99a09c058..9df697e4c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -264,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
- Barrier barrier(numblocks);
+ Barrier barrier(internal::convert_index<unsigned int>(numblocks));
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
@@ -316,7 +316,7 @@ struct OuterReducer {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
@@ -492,7 +492,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
}
// Attempt to use an optimized reduction.
- else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) {
+ else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
bool reducing_inner_dims = true;
for (int i = 0; i < NumReducedDims; ++i) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -505,8 +505,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
(reducing_inner_dims || ReducingInnerMostDims)) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+ data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ m_result = data;
+ }
Op reducer(m_reducer);
- return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+ if (m_result) {
+ m_device.deallocate(m_result);
+ m_result = NULL;
+ }
+ return true;
+ } else {
+ return (m_result != NULL);
+ }
}
bool preserving_inner_dims = true;
@@ -521,8 +533,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
preserving_inner_dims) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+ data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ m_result = data;
+ }
Op reducer(m_reducer);
- return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+ if (m_result) {
+ m_device.deallocate(m_result);
+ m_result = NULL;
+ }
+ return true;
+ } else {
+ return (m_result != NULL);
+ }
}
}
return true;
@@ -537,8 +561,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- if (RunningFullReduction && m_result) {
- return *m_result;
+ if ((RunningFullReduction || RunningOnGPU) && m_result) {
+ return *(m_result + index);
}
Op reducer(m_reducer);
if (ReducingInnerMostDims || RunningFullReduction) {
@@ -558,7 +582,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+ eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
+
+ if (RunningOnGPU && m_result) {
+ return internal::pload<PacketReturnType>(m_result + index);
+ }
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
if (ReducingInnerMostDims) {
@@ -616,7 +644,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
- template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
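The net effect of the TensorReduction.h changes: the optimized GPU reducers can now run even when no output buffer was passed in, by allocating a small scratch result (m_result) and serving later coeff()/packet() reads from it. A simplified restatement of the control flow, with every helper hypothetical:

    #include <cstddef>

    // Sketch only. run_kernel() mirrors InnerReducer::run: it returns true
    // when the optimized kernel could NOT be used and the caller must fall back.
    bool output_is_small();
    float* allocate_on_device();
    void deallocate_on_device(float*);
    bool run_kernel(float* data);

    bool eval_sub_exprs_example(float* data, float*& m_result) {
      if (data == NULL && output_is_small()) {
        data = allocate_on_device();
        m_result = data;                 // evaluator owns this scratch buffer
      }
      if (run_kernel(data)) {            // optimized path refused the job
        if (m_result) { deallocate_on_device(m_result); m_result = NULL; }
        return true;                     // evaluate coefficient-by-coefficient
      }
      // Kernel wrote the results. If they landed in our scratch buffer,
      // return true so reads go through coeff()/packet(), which now answer
      // from *(m_result + index); otherwise the caller's buffer is filled.
      return m_result != NULL;
    }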
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 45087a9a4..65638b6a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,11 +67,21 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
#endif
}
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-#if __CUDA_ARCH__ >= 300
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
@@ -87,9 +97,6 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
return;
}
}
-#else
- assert(0 && "Shouldn't be called on unsupported device");
-#endif
}
#endif
@@ -112,18 +119,44 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
}
}
+
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
- typename Self::CoeffReturnType* output) {
+ typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if __CUDA_ARCH__ >= 300
+ // Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
- // Initialize the output value if it wasn't initialized by the ReductionInitKernel
- if (gridDim.x == 1 && first_index == 0) {
- *output = reducer.initialize();
- __syncthreads();
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ *output = reducer.initialize();
+ }
+ }
+ else {
+ if (threadIdx.x == 0) {
+ unsigned int block = atomicCAS(semaphore, 0u, 1u);
+ if (block == 0) {
+ // We're the first block to run, so initialize the output value
+ atomicExchCustom(output, reducer.initialize());
+ __threadfence();
+ atomicExch(semaphore, 2u);
+ }
+ else {
+ // Wait for the first block to initialize the output value.
+ // Use atomicCAS here to ensure that the reads aren't cached
+ unsigned int val;
+ do {
+ val = atomicCAS(semaphore, 2u, 2u);
+ }
+ while (val < 2u);
+ }
+ }
}
+ __syncthreads();
+
+ eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
typename Self::CoeffReturnType accum = reducer.initialize();
Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
for (Index i = 0; i < max_iter; i+=BlockSize) {
@@ -141,6 +174,14 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(output, accum, reducer);
}
+
+ if (gridDim.x > 1 && threadIdx.x == 0) {
+ // Let the last block reset the semaphore
+ atomicInc(semaphore, gridDim.x + 1);
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
}
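Between thread blocks the kernel now synchronizes through a three-state semaphore instead of a separate init kernel: 0 = output uninitialized, 1 = one block is initializing it, 2 = initialized; the closing atomicInc wraps the counter back to 0 once all gridDim.x blocks have passed, so the semaphore is reusable on the next launch. A condensed restatement of the handshake, simplified to a plain float accumulator:

    // CUDA sketch of the cross-block initialization protocol used above.
    __device__ void init_once_example(float* output, unsigned int* semaphore) {
      if (threadIdx.x == 0) {
        if (atomicCAS(semaphore, 0u, 1u) == 0u) { // won the race: state 0 -> 1
          *output = 0.f;                          // initialize the accumulator
          __threadfence();                        // publish before state 2
          atomicExch(semaphore, 2u);
        } else {
          // Spin until the winner reaches state 2; atomicCAS(s, 2u, 2u)
          // is a read that cannot be served from a stale cache.
          while (atomicCAS(semaphore, 2u, 2u) < 2u) { }
        }
      }
      __syncthreads();                            // whole block may now reduce
    }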
@@ -229,32 +270,35 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
- assert(false && "Should only be called on floats and half floats");
+ assert(false && "Should only be called on doubles, floats and half floats");
}
};
-template <typename Self, typename Op, bool PacketAccess>
-struct FullReductionLauncher<Self, Op, float, PacketAccess> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ unsigned int* semaphore = NULL;
if (num_blocks > 1) {
- // We initialize the outputs outside the reduction kernel when we can't be sure that there
- // won't be a race conditions between multiple thread blocks.
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>),
- 1, 32, 0, device, reducer.initialize(), 1, output);
+ semaphore = device.semaphore();
}
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
}
};
@@ -298,27 +342,29 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support exotic types such as complex well,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats and half floats.
- #ifdef EIGEN_HAS_CUDA_FP16
+ // so reduce the scope of the optimized version of the code to the simple cases
+ // of doubles, floats and half floats
+#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif
template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
- assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
return;
}
- FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
@@ -327,6 +373,8 @@ template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
+#if __CUDA_ARCH__ >= 300
+ typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
@@ -356,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
- float reduced_val = reducer.initialize();
+ Type reduced_val = reducer.initialize();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
- const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
reducer.reduce(val, &reduced_val);
}
break;
@@ -386,6 +434,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
}
}
}
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
}
#ifdef EIGEN_HAS_CUDA_FP16
@@ -485,17 +536,23 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#endif
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce floats and half floats on a gpu device");
+ assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
return true;
}
};
-template <typename Self, typename Op, bool PacketAccess>
-struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
@@ -513,7 +570,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
@@ -580,15 +637,17 @@ struct InnerReducer<Self, Op, GpuDevice> {
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif
template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
@@ -599,7 +658,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
return true;
}
- return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
}
};
@@ -639,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
// so reduce the scope of the optimized version of the code to the simple cases
// of floats and doubles.
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
-
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
template <typename Device, typename OutputType>
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce floats on a gpu device");
+ assert(false && "Should only be called to reduce doubles or floats on a gpu device");
return true;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 031dbf6f2..8501466ce 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -9,9 +9,11 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+
namespace Eigen {
namespace internal {
+
template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
: public traits<XprType> {
@@ -42,9 +44,7 @@ struct nested<TensorScanOp<Op, XprType>, 1,
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor scan class.
- *
*/
-
template <typename Op, typename XprType>
class TensorScanOp
: public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
@@ -57,8 +57,8 @@ public:
typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
- const XprType& expr, const Index& axis, const Op& op = Op())
- : m_expr(expr), m_axis(axis), m_accumulator(op) {}
+ const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
+ : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Index axis() const { return m_axis; }
@@ -66,13 +66,19 @@ public:
const XprType& expression() const { return m_expr; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Op accumulator() const { return m_accumulator; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool exclusive() const { return m_exclusive; }
protected:
typename XprType::Nested m_expr;
const Index m_axis;
const Op m_accumulator;
+ const bool m_exclusive;
};
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher;
+
// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
@@ -81,13 +87,14 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
enum {
IsAligned = false,
- PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@@ -98,45 +105,71 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
const Device& device)
: m_impl(op.expression(), device),
m_device(device),
- m_axis(op.axis()),
+ m_exclusive(op.exclusive()),
m_accumulator(op.accumulator()),
- m_dimensions(m_impl.dimensions()),
- m_size(m_dimensions[m_axis]),
+ m_size(m_impl.dimensions()[op.axis()]),
m_stride(1),
m_output(NULL) {
// Accumulating a scalar isn't supported.
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(m_axis >= 0 && m_axis < NumDims);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
// Compute stride of scan axis
+ const Dimensions& dims = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < m_axis; ++i) {
- m_stride = m_stride * m_dimensions[i];
+ for (int i = 0; i < op.axis(); ++i) {
+ m_stride = m_stride * dims[i];
}
} else {
- for (int i = NumDims - 1; i > m_axis; --i) {
- m_stride = m_stride * m_dimensions[i];
+ for (int i = NumDims - 1; i > op.axis(); --i) {
+ m_stride = m_stride * dims[i];
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_dimensions;
+ return m_impl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
+ return m_stride;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
+ return m_size;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
+ return m_accumulator;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
+ return m_exclusive;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
+ return m_impl;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
+ return m_device;
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
m_impl.evalSubExprsIfNeeded(NULL);
+ ScanLauncher<Self, Op, Device> launcher;
if (data) {
- accumulateTo(data);
+ launcher(*this, data);
return false;
- } else {
- m_output = static_cast<CoeffReturnType*>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
- accumulateTo(m_output);
- return true;
}
+
+ const Index total_size = internal::array_prod(dimensions());
+ m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+ launcher(*this, m_output);
+ return true;
}
-
+
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
@@ -152,6 +185,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
return m_output[index];
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
if (m_output != NULL) {
m_device.deallocate(m_output);
@@ -163,35 +200,88 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
protected:
TensorEvaluator<ArgType, Device> m_impl;
const Device& m_device;
- const Index m_axis;
+ const bool m_exclusive;
Op m_accumulator;
- const Dimensions& m_dimensions;
- const Index& m_size;
+ const Index m_size;
Index m_stride;
CoeffReturnType* m_output;
+};
+
+// CPU implementation of scan
+// TODO(ibab) This single-threaded implementation should be parallelized,
+// at least by running multiple scans at the same time.
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher {
+ void operator()(Self& self, typename Self::CoeffReturnType *data) {
+ Index total_size = internal::array_prod(self.dimensions());
- // TODO(ibab) Parallelize this single-threaded implementation if desired
- EIGEN_DEVICE_FUNC void accumulateTo(Scalar* data) {
- // We fix the index along the scan axis to 0 and perform an
+ // We fix the index along the scan axis to 0 and perform a
// scan per remaining entry. The iteration is split into two nested
// loops to avoid an integer division by keeping track of each idx1 and idx2.
- for (Index idx1 = 0; idx1 < dimensions().TotalSize() / m_size; idx1 += m_stride) {
- for (Index idx2 = 0; idx2 < m_stride; idx2++) {
- // Calculate the starting offset for the scan
- Index offset = idx1 * m_size + idx2;
-
- // Compute the prefix sum along the axis, starting at the calculated offset
- CoeffReturnType accum = m_accumulator.initialize();
- for (Index idx3 = 0; idx3 < m_size; idx3++) {
- Index curr = offset + idx3 * m_stride;
- m_accumulator.reduce(m_impl.coeff(curr), &accum);
- data[curr] = m_accumulator.finalize(accum);
+ for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+ for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+ // Calculate the starting offset for the scan
+ Index offset = idx1 + idx2;
+
+ // Compute the scan along the axis, starting at the calculated offset
+ typename Self::CoeffReturnType accum = self.accumulator().initialize();
+ for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+ Index curr = offset + idx3 * self.stride();
+
+ if (self.exclusive()) {
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ } else {
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
}
- }
+ }
+ }
}
}
};
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+ // Compute offset as in the CPU version
+ Index val = threadIdx.x + blockIdx.x * blockDim.x;
+ Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+ if (offset + (self.size() - 1) * self.stride() < total_size) {
+ // Compute the scan along the axis, starting at the calculated offset
+ typename Self::CoeffReturnType accum = self.accumulator().initialize();
+ for (Index idx = 0; idx < self.size(); idx++) {
+ Index curr = offset + idx * self.stride();
+ if (self.exclusive()) {
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ } else {
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
+ }
+ }
+ }
+ __syncthreads();
+
+}
+
+template <typename Self, typename Reducer>
+struct ScanLauncher<Self, Reducer, GpuDevice> {
+ void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+ Index total_size = internal::array_prod(self.dimensions());
+ Index num_blocks = (total_size / self.size() + 63) / 64;
+ Index block_size = 64;
+ LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+ }
+};
+#endif // EIGEN_USE_GPU && __CUDACC__
+
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
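With the new flag, one TensorScanOp covers both scan flavours: exclusive mode writes the accumulator before folding in the current element, so entry i holds the combination of elements 0..i-1. A usage sketch through the cumsum API on TensorBase:

    #include <unsupported/Eigen/CXX11/Tensor>

    void scan_example() {
      Eigen::Tensor<int, 1> t(4);
      t.setValues({1, 2, 3, 4});
      // Inclusive scan along axis 0: {1, 3, 6, 10}.
      Eigen::Tensor<int, 1> inc = t.cumsum(0);
      // Exclusive scan along axis 0: {0, 1, 3, 6}.
      Eigen::Tensor<int, 1> exc = t.cumsum(0, /*exclusive=*/true);
    }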
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index bdcd70fd9..3523e7c94 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -20,6 +20,7 @@ struct static_val {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
eigen_assert(v == n);
@@ -53,7 +54,7 @@ struct TensorUInt128
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
explicit TensorUInt128(const T& x) : high(0), low(x) {
- eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest())));
+ eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
eigen_assert(x >= 0);
}
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
deleted file mode 100644
index 6e871a8da..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_TensorSymmetry_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel
- )
-
-add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
deleted file mode 100644
index dc9fc78ec..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_TensorSymmetry_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
deleted file mode 100644
index 88fef50c6..000000000
--- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_ThreadPool_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 6dd64f185..12b80d6c4 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -169,7 +169,8 @@ class EventCount {
class Waiter {
friend class EventCount;
- std::atomic<Waiter*> next;
+ // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector.
+ EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
std::mutex mu;
std::condition_variable cv;
uint64_t epoch;
@@ -179,8 +180,6 @@ class EventCount {
kWaiting,
kSignaled,
};
- // Prevent false sharing with other Waiter objects in the same vector.
- char pad_[128];
};
private:
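Switching from a trailing pad_ array to an alignment directive guarantees every Waiter starts on its own 128-byte boundary, so adjacent Waiters in the waiters_ vector cannot share a cache line. The same idea in portable C++11:

    #include <atomic>

    // alignas(128) forces each element onto its own pair of 64-byte cache
    // lines; a write to one counter never invalidates a neighbour's line.
    struct alignas(128) PaddedCounter {
      std::atomic<long> value{0};
    };

    PaddedCounter counters[8];  // &counters[i+1] - &counters[i] >= 128 bytes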
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index c094563b7..33ae45131 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -74,7 +74,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
PerThread* pt = GetPerThread();
if (pt->pool == this) {
// Worker thread of this pool, push onto the thread's queue.
- Queue* q = queues_[pt->index];
+ Queue* q = queues_[pt->thread_id];
t = q->PushFront(std::move(t));
} else {
// A free-standing thread (or worker of another pool), push onto a random
@@ -95,14 +95,28 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
env_.ExecuteTask(t); // Push failed, execute directly.
}
+ int NumThreads() const final {
+ return static_cast<int>(threads_.size());
+ }
+
+ int CurrentThreadId() const final {
+ const PerThread* pt =
+ const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
+ if (pt->pool == this) {
+ return pt->thread_id;
+ } else {
+ return -1;
+ }
+ }
+
private:
typedef typename Environment::EnvThread Thread;
struct PerThread {
- bool inited;
+ constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads.
- unsigned index; // Worker thread index in pool.
- unsigned rand; // Random generator state.
+ uint64_t rand; // Random generator state.
+ int thread_id; // Worker thread index in pool.
};
Environment env_;
@@ -116,12 +130,13 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
EventCount ec_;
// Main worker thread loop.
- void WorkerLoop(unsigned index) {
+ void WorkerLoop(int thread_id) {
PerThread* pt = GetPerThread();
pt->pool = this;
- pt->index = index;
- Queue* q = queues_[index];
- EventCount::Waiter* waiter = &waiters_[index];
+ pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+ pt->thread_id = thread_id;
+ Queue* q = queues_[thread_id];
+ EventCount::Waiter* waiter = &waiters_[thread_id];
for (;;) {
Task t = q->PopFront();
if (!t.f) {
@@ -235,17 +250,18 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
return -1;
}
- PerThread* GetPerThread() {
+ static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
EIGEN_THREAD_LOCAL PerThread per_thread_;
PerThread* pt = &per_thread_;
- if (pt->inited) return pt;
- pt->inited = true;
- pt->rand = static_cast<unsigned>(std::hash<std::thread::id>()(std::this_thread::get_id()));
return pt;
}
- static unsigned Rand(unsigned* state) {
- return *state = *state * 1103515245 + 12345;
+ static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
+ uint64_t current = *state;
+ // Update the internal state
+ *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+ // Generate the random output (using the PCG-XSH-RS scheme)
+ return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}
};
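The per-thread generator moves from a 32-bit LCG to PCG-XSH-RS: a 64-bit LCG state whose output is the high bits, xor-shifted and then right-shifted by a state-dependent amount, giving a much better 32-bit stream for picking steal victims. Standalone, with the same constants:

    #include <cstdint>

    // PCG-XSH-RS: 64-bit LCG state, 32-bit output.
    inline unsigned pcg_xsh_rs(uint64_t* state) {
      uint64_t current = *state;
      // Advance the LCG state.
      *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
      // Xorshift, then a random shift driven by the top 3 bits of the state.
      return static_cast<unsigned>(
          (current ^ (current >> 22)) >> (22 + (current >> 61)));
    }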
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
index 17fd1658b..e75d0f467 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -24,7 +24,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
: env_(env), threads_(num_threads), waiters_(num_threads) {
for (int i = 0; i < num_threads; i++) {
- threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
+ threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));
}
}
@@ -55,7 +55,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
// Schedule fn() for execution in the pool of threads. The functions are
// executed in the order in which they are scheduled.
- void Schedule(std::function<void()> fn) {
+ void Schedule(std::function<void()> fn) final {
Task t = env_.CreateTask(std::move(fn));
std::unique_lock<std::mutex> l(mu_);
if (waiters_.empty()) {
@@ -69,9 +69,25 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
}
}
+ int NumThreads() const final {
+ return static_cast<int>(threads_.size());
+ }
+
+ int CurrentThreadId() const final {
+ const PerThread* pt = this->GetPerThread();
+ if (pt->pool == this) {
+ return pt->thread_id;
+ } else {
+ return -1;
+ }
+ }
+
protected:
- void WorkerLoop() {
+ void WorkerLoop(int thread_id) {
std::unique_lock<std::mutex> l(mu_);
+ PerThread* pt = GetPerThread();
+ pt->pool = this;
+ pt->thread_id = thread_id;
Waiter w;
Task t;
while (!exiting_) {
@@ -111,13 +127,24 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
bool ready;
};
+ struct PerThread {
+ constexpr PerThread() : pool(NULL), thread_id(-1) { }
+ SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads.
+ int thread_id; // Worker thread index in pool.
+ };
+
Environment env_;
std::mutex mu_;
MaxSizeVector<Thread*> threads_; // All threads
MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
- std::deque<Task> pending_; // Queue of pending work
- std::condition_variable empty_; // Signaled on pending_.empty()
+ std::deque<Task> pending_; // Queue of pending work
+ std::condition_variable empty_; // Signaled on pending_.empty()
bool exiting_ = false;
+
+ PerThread* GetPerThread() const {
+ EIGEN_THREAD_LOCAL PerThread per_thread;
+ return &per_thread;
+ }
};
typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index d2204ad5b..399f95cc1 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -21,14 +21,14 @@ struct StlThreadEnvironment {
// destructor must join the thread.
class EnvThread {
public:
- EnvThread(std::function<void()> f) : thr_(f) {}
+ EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
~EnvThread() { thr_.join(); }
private:
std::thread thr_;
};
- EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
+ EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
void ExecuteTask(const Task& t) { t.f(); }
};
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 38b40aceb..a65ee97c9 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -18,6 +18,13 @@ class ThreadPoolInterface {
public:
virtual void Schedule(std::function<void()> fn) = 0;
+ // Returns the number of threads in the pool.
+ virtual int NumThreads() const = 0;
+
+ // Returns a logical thread index between 0 and NumThreads() - 1 if called
+ // from one of the threads in the pool. Returns -1 otherwise.
+ virtual int CurrentThreadId() const = 0;
+
virtual ~ThreadPoolInterface() {}
};
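The two new pure virtuals give callers a stable way to shard per-thread state: CurrentThreadId() is a dense index in [0, NumThreads()) inside the pool and -1 outside it. A hypothetical consumer:

    #include <atomic>
    #include <vector>

    // Shard per-thread state by CurrentThreadId(), falling back to an
    // atomic for calls made from outside the pool.
    void bump_example(Eigen::ThreadPoolInterface* pool,
                      std::vector<long>& per_thread,
                      std::atomic<long>& shared) {
      const int id = pool->CurrentThreadId();
      if (id >= 0) {
        per_thread[id]++;     // id < pool->NumThreads(), no lock needed
      } else {
        shared.fetch_add(1);  // caller is not one of the pool's workers
      }
    }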
diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
deleted file mode 100644
index 7eab492d6..000000000
--- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct
index c932c06a6..5f5afb8cf 100644
--- a/unsupported/Eigen/KroneckerProduct
+++ b/unsupported/Eigen/KroneckerProduct
@@ -13,6 +13,8 @@
#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "../../Eigen/src/SparseCore/SparseUtil.h"
+
namespace Eigen {
/**
diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport
index 89036886b..7f0b70c63 100644
--- a/unsupported/Eigen/MPRealSupport
+++ b/unsupported/Eigen/MPRealSupport
@@ -67,27 +67,32 @@ int main()
IsSigned = 1,
IsComplex = 0,
RequireInitialization = 1,
- ReadCost = 10,
- AddCost = 10,
- MulCost = 40
+ ReadCost = HugeCost,
+ AddCost = HugeCost,
+ MulCost = HugeCost
};
typedef mpfr::mpreal Real;
typedef mpfr::mpreal NonInteger;
- inline static Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); }
- inline static Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
+ static inline Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); }
+ static inline Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
// Constants
- inline static Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); }
- inline static Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); }
- inline static Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); }
- inline static Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); }
+ static inline Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); }
+ static inline Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); }
+ static inline Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); }
+ static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); }
- inline static Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); }
- inline static Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); }
+ static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); }
+ static inline Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); }
- inline static Real dummy_precision()
+#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
+ static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec()) { return std::numeric_limits<Real>::digits10(Precision); }
+ static inline int digits10 (const Real& x) { return std::numeric_limits<Real>::digits10(x); }
+#endif
+
+ static inline Real dummy_precision()
{
mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
return mpfr::machine_epsilon(weak_prec);
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
new file mode 100644
index 000000000..7c7493c56
--- /dev/null
+++ b/unsupported/Eigen/SpecialFunctions
@@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_MODULE
+#define EIGEN_SPECIALFUNCTIONS_MODULE
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup SpecialFunctions_Module Special math functions module
+ *
+ * This module features additional coefficient-wise math functions available
+ * within the numext:: namespace for the scalar version, and as methods and/or free
+ * functions of Array. Those include:
+ *
+ * - erf
+ * - erfc
+ * - lgamma
+ * - igamma
+ * - igammac
+ * - digamma
+ * - polygamma
+ * - zeta
+ * - betainc
+ *
+ * \code
+ * #include <unsupported/Eigen/SpecialFunctions>
+ * \endcode
+ */
+//@{
+
+}
+
+#include "src/SpecialFunctions/SpecialFunctionsImpl.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
+#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
+
+#if defined EIGEN_VECTORIZE_CUDA
+ #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
+#endif
+
+namespace Eigen {
+//@}
+}
+
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPECIALFUNCTIONS_MODULE
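A usage sketch of the new standalone module through the Array API (erf shown; the other functions in the list above follow the same pattern):

    #include <unsupported/Eigen/SpecialFunctions>
    #include <iostream>

    void special_functions_example() {
      Eigen::ArrayXd x = Eigen::ArrayXd::LinSpaced(5, -2.0, 2.0);
      // Coefficient-wise error function, via the Array method...
      Eigen::ArrayXd y = x.erf();
      // ...or the scalar version in numext:: for a single value.
      double z = Eigen::numext::erf(0.5);
      std::cout << y.transpose() << " " << z << "\n";
    }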
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 089042751..50fedf6ac 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -30,6 +30,13 @@ template<typename _DerType, bool Enable> struct auto_diff_special_op;
} // end namespace internal
+template<typename _DerType> class AutoDiffScalar;
+
+template<typename NewDerType>
+inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) {
+ return AutoDiffScalar<NewDerType>(value,der);
+}
+
/** \class AutoDiffScalar
* \brief A scalar type replacement with automatic differentiation capability
*
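For context, a usage sketch of AutoDiffScalar itself: value() carries f(x), derivatives() carries the gradient, and the overloaded operators (including the rewritten operator* and operator/ below) propagate both:

    #include <unsupported/Eigen/AutoDiff>

    void autodiff_example() {
      typedef Eigen::AutoDiffScalar<Eigen::Vector2d> AD;
      AD x(3.0, 2, 0);  // value 3, derivatives (1, 0)
      AD y(2.0, 2, 1);  // value 2, derivatives (0, 1)
      AD p = x * y;     // 6,   d = (y, x)          = (2, 3)
      AD q = x / y;     // 1.5, d = (1/y, -x/y^2)   = (0.5, -0.75)
      AD f = p + q;     // f.value() == 7.5
      Eigen::Vector2d grad = f.derivatives();  // (2.5, 2.25)
    }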
@@ -60,7 +67,7 @@ template<typename _DerType>
class AutoDiffScalar
: public internal::auto_diff_special_op
<_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
- typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
+ typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
{
public:
typedef internal::auto_diff_special_op
@@ -257,20 +264,16 @@ class AutoDiffScalar
-m_derivatives);
}
- inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
operator*(const Scalar& other) const
{
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
- m_value * other,
- (m_derivatives * other));
+ return MakeAutoDiffScalar(m_value * other, m_derivatives * other);
}
- friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+ friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
operator*(const Scalar& other, const AutoDiffScalar& a)
{
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
- a.value() * other,
- a.derivatives() * other);
+ return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other);
}
// inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
@@ -289,20 +292,16 @@ class AutoDiffScalar
// a.derivatives() * other);
// }
- inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
operator/(const Scalar& other) const
{
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
- m_value / other,
- (m_derivatives * (Scalar(1)/other)));
+ return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other)));
}
- friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+ friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
operator/(const Scalar& other, const AutoDiffScalar& a)
{
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
- other / a.value(),
- a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
+ return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
}
// inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
@@ -322,34 +321,29 @@ class AutoDiffScalar
// }
template<typename OtherDerType>
- inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
- const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(
+ CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) >
operator/(const AutoDiffScalar<OtherDerType>& other) const
{
internal::make_coherent(m_derivatives, other.derivatives());
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
- const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >(
+ return MakeAutoDiffScalar(
m_value / other.value(),
- ((m_derivatives * other.value()) - (m_value * other.derivatives()))
+ ((m_derivatives * other.value()) - (other.derivatives() * m_value))
* (Scalar(1)/(other.value()*other.value())));
}
template<typename OtherDerType>
inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > >
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product),
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > >
operator*(const AutoDiffScalar<OtherDerType>& other) const
{
internal::make_coherent(m_derivatives, other.derivatives());
- return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >(
+ return MakeAutoDiffScalar(
m_value * other.value(),
- (m_derivatives * other.value()) + (m_value * other.derivatives()));
+ (m_derivatives * other.value()) + (other.derivatives() * m_value));
}
inline AutoDiffScalar& operator*=(const Scalar& other)
@@ -426,18 +420,18 @@ struct auto_diff_special_op<_DerType, true>
}
- inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+ inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >
operator*(const Real& other) const
{
- return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+ return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >(
derived().value() * other,
derived().derivatives() * other);
}
- friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+ friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >
operator*(const Real& other, const AutoDiffScalar<_DerType>& a)
{
- return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+ return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >(
a.value() * other,
a.derivatives() * other);
}
@@ -501,43 +495,44 @@ struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows,
}
};
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar>
-{
- enum { Defined = 1 };
- typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
-
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> >
-{
- enum { Defined = 1 };
- typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
+} // end namespace internal
-template<typename DerType>
-struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar>
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp>
{
- enum { Defined = 1 };
typedef AutoDiffScalar<DerType> ReturnType;
};
-template<typename DerType>
-struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> >
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp>
{
- enum { Defined = 1 };
typedef AutoDiffScalar<DerType> ReturnType;
};
-} // end namespace internal
+
+// The following is an attempt to let Eigen know about expression templates, but that's more tricky!
+
+// template<typename DerType, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp>
+// {
+// enum { Defined = 1 };
+// typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType;
+// };
+//
+// template<typename DerType1,typename DerType2, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp>
+// {
+// enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value };
+// typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType;
+// };
#define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
template<typename DerType> \
- inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \
+ inline const Eigen::AutoDiffScalar< \
+ EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \
FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
using namespace Eigen; \
- typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
- typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \
+ EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
CODE; \
}
@@ -567,49 +562,56 @@ inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::Plain
typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
return (x > y ? ADS(x) : ADS(y));
}
+template<typename DerType>
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+ return (x.value() < y.value() ? x : y);
+}
+template<typename DerType>
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+ return (x.value() >= y.value() ? x : y);
+}
+
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs,
using std::abs;
- return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
+ return Eigen::MakeAutoDiffScalar(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2,
using numext::abs2;
- return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
+ return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt,
using std::sqrt;
Scalar sqrtx = sqrt(x.value());
- return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
+ return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos,
using std::cos;
using std::sin;
- return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));)
+ return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin,
using std::sin;
using std::cos;
- return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));)
+ return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp,
using std::exp;
Scalar expx = exp(x.value());
- return ReturnType(expx,x.derivatives() * expx);)
+ return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log,
using std::log;
- return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
+ return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
template<typename DerType>
-inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar>, const typename internal::remove_all<DerType>::type> >
-pow(const Eigen::AutoDiffScalar<DerType>& x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y)
+inline const Eigen::AutoDiffScalar<
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) >
+pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y)
{
using namespace Eigen;
- typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
- typedef typename Eigen::internal::traits<DerTypeCleaned>::Scalar Scalar;
- return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerTypeCleaned> >(
- std::pow(x.value(),y),
- x.derivatives() * (y * std::pow(x.value(),y-1)));
+ using std::pow;
+ return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1)));
}
@@ -634,27 +636,44 @@ atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan,
using std::tan;
using std::cos;
- return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
+ return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin,
using std::sqrt;
using std::asin;
- return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
+ return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos,
using std::sqrt;
using std::acos;
- return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+ return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh,
+ using std::cosh;
+ using std::tanh;
+ return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh,
+ using std::sinh;
+ using std::cosh;
+ return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh,
+ using std::sinh;
+ using std::cosh;
+ return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));)
#undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY
template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
- : NumTraits< typename NumTraits<typename DerType::Scalar>::Real >
+ : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real >
{
- typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime,
- DerType::Options, DerType::MaxRowsAtCompileTime, DerType::MaxColsAtCompileTime> > Real;
+ typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
+ typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime,
+ 0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real;
typedef AutoDiffScalar<DerType> NonInteger;
typedef AutoDiffScalar<DerType> Nested;
+ typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal;
enum{
RequireInitialization = 1
};
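To make the refactored operators concrete, here is a minimal forward-mode AD sketch against the public AutoDiffScalar API (illustrative usage, not part of the patch; sin() is one of the overloads declared via EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY above):

    #include <unsupported/Eigen/AutoDiff>
    #include <iostream>

    int main() {
      typedef Eigen::AutoDiffScalar<Eigen::VectorXd> ADScalar;
      // one independent variable: value 2.0, derivative seeded with e_0
      ADScalar x(2.0, Eigen::VectorXd::Unit(1, 0));
      ADScalar y = x * sin(x);                  // sin() found by ADL in namespace Eigen
      std::cout << y.value() << "\n";           // 2*sin(2)
      std::cout << y.derivatives()(0) << "\n";  // sin(2) + 2*cos(2), by the product rule
      return 0;
    }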
diff --git a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt b/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
deleted file mode 100644
index ad91fd9c4..000000000
--- a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_AutoDiff_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_AutoDiff_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/AutoDiff COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/BVH/CMakeLists.txt b/unsupported/Eigen/src/BVH/CMakeLists.txt
deleted file mode 100644
index b377d865c..000000000
--- a/unsupported/Eigen/src/BVH/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_BVH_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_BVH_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/BVH COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt
deleted file mode 100644
index 754953335..000000000
--- a/unsupported/Eigen/src/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-ADD_SUBDIRECTORY(AutoDiff)
-ADD_SUBDIRECTORY(BVH)
-ADD_SUBDIRECTORY(Eigenvalues)
-ADD_SUBDIRECTORY(FFT)
-ADD_SUBDIRECTORY(IterativeSolvers)
-ADD_SUBDIRECTORY(LevenbergMarquardt)
-ADD_SUBDIRECTORY(MatrixFunctions)
-ADD_SUBDIRECTORY(MoreVectorization)
-ADD_SUBDIRECTORY(NonLinearOptimization)
-ADD_SUBDIRECTORY(NumericalDiff)
-ADD_SUBDIRECTORY(Polynomials)
-ADD_SUBDIRECTORY(Skyline)
-ADD_SUBDIRECTORY(SparseExtra)
-ADD_SUBDIRECTORY(KroneckerProduct)
-ADD_SUBDIRECTORY(Splines)
-ADD_SUBDIRECTORY(EulerAngles)
diff --git a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt
deleted file mode 100644
index 1d4387c82..000000000
--- a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Eigenvalues_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Eigenvalues_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Eigenvalues COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/FFT/CMakeLists.txt b/unsupported/Eigen/src/FFT/CMakeLists.txt
deleted file mode 100644
index edcffcb18..000000000
--- a/unsupported/Eigen/src/FFT/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_FFT_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_FFT_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/FFT COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt b/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
deleted file mode 100644
index 7986afc5e..000000000
--- a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_IterativeSolvers_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_IterativeSolvers_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/IterativeSolvers COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt b/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
deleted file mode 100644
index 4daefebee..000000000
--- a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_KroneckerProduct_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_KroneckerProduct_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/KroneckerProduct COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
index bf9727c21..582fa8512 100644
--- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -203,7 +203,7 @@ struct traits<KroneckerProduct<_Lhs,_Rhs> >
{
typedef typename remove_all<_Lhs>::type Lhs;
typedef typename remove_all<_Rhs>::type Rhs;
- typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+ typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
enum {
@@ -222,7 +222,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
typedef MatrixXpr XprKind;
typedef typename remove_all<_Lhs>::type Lhs;
typedef typename remove_all<_Rhs>::type Rhs;
- typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+ typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind;
typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
deleted file mode 100644
index d9690854d..000000000
--- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_LevenbergMarquardt_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt b/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
deleted file mode 100644
index cdde64d2c..000000000
--- a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MatrixFunctions_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_MatrixFunctions_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MatrixFunctions COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index 9f08c6162..afd88ec4d 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -65,21 +65,6 @@ void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, ty
sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs);
}
-// similar to compute1x1offDiagonalBlock()
-template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
-{
- typedef typename traits<MatrixType>::Scalar Scalar;
- Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
- Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
- Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
- if (j-i > 2)
- C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
- Matrix<Scalar,2,2> X;
- matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
- sqrtT.template block<2,2>(i,j) = X;
-}
-
// solves the equation A X + X B = C where all matrices are 2-by-2
template <typename MatrixType>
void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C)
@@ -98,13 +83,13 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
coeffMatrix.coeffRef(2,3) = B.coeff(1,0);
coeffMatrix.coeffRef(3,1) = A.coeff(1,0);
coeffMatrix.coeffRef(3,2) = B.coeff(0,1);
-
+
Matrix<Scalar,4,1> rhs;
rhs.coeffRef(0) = C.coeff(0,0);
rhs.coeffRef(1) = C.coeff(0,1);
rhs.coeffRef(2) = C.coeff(1,0);
rhs.coeffRef(3) = C.coeff(1,1);
-
+
Matrix<Scalar,4,1> result;
result = coeffMatrix.fullPivLu().solve(rhs);
@@ -114,6 +99,20 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
X.coeffRef(1,1) = result.coeff(3);
}
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+{
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
+ Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
+ Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
+ if (j-i > 2)
+ C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
+ Matrix<Scalar,2,2> X;
+ matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
+ sqrtT.template block<2,2>(i,j) = X;
+}
// pre: T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
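For reference, the 4x4 coeffMatrix assembled in matrix_sqrt_quasi_triangular_solve_auxiliary_equation above is just the vectorized form of the 2x2 Sylvester equation: storing X row-major as vec(X) = (X00, X01, X10, X11), the equation A X + X B = C becomes

    (kroneckerProduct(A, I2) + kroneckerProduct(I2, B.transpose())) * vec(X) = vec(C)

which is what the explicit coeffRef() assignments spell out entry by entry (the kroneckerProduct notation is borrowed from the unsupported KroneckerProduct module for illustration only).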
diff --git a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt b/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
deleted file mode 100644
index 1b887cc8e..000000000
--- a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MoreVectorization_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_MoreVectorization_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MoreVectorization COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt b/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
deleted file mode 100644
index 9322ddadf..000000000
--- a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NonLinearOptimization_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_NonLinearOptimization_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NonLinearOptimization COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt b/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
deleted file mode 100644
index 1199aca2f..000000000
--- a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NumericalDiff_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_NumericalDiff_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NumericalDiff COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/Polynomials/CMakeLists.txt b/unsupported/Eigen/src/Polynomials/CMakeLists.txt
deleted file mode 100644
index 51f13f3cb..000000000
--- a/unsupported/Eigen/src/Polynomials/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Polynomials_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Polynomials_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Polynomials COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/Skyline/CMakeLists.txt b/unsupported/Eigen/src/Skyline/CMakeLists.txt
deleted file mode 100644
index 3bf1b0dd4..000000000
--- a/unsupported/Eigen/src/Skyline/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Skyline_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Skyline_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Skyline COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt b/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
deleted file mode 100644
index 7ea32ca5e..000000000
--- a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseExtra_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_SparseExtra_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SparseExtra COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
new file mode 100644
index 000000000..ed415db99
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -0,0 +1,124 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) of the given arrays.
+ *
+ * This function computes the coefficient-wise incomplete gamma function.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of igamma(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igammac(), Eigen::lgamma()
+ */
+template<typename Derived,typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+ a.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) of the given arrays.
+ *
+ * This function computes the coefficient-wise complementary incomplete gamma function.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template<typename Derived,typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+ a.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) of the given arrays.
+ *
+ * It returns the \a n -th derivative of the digamma (psi) function evaluated at \c x.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::digamma()
+ */
+// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+// * \sa ArrayBase::polygamma()
+template<typename DerivedN,typename DerivedX>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
+ n.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) of the given arrays.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::betainc(), Eigen::lgamma()
+ */
+template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
+inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
+{
+ return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
+ a.derived(),
+ b.derived(),
+ x.derived()
+ );
+}
+
+
+/** \returns an expression of the coefficient-wise zeta(\a x, \a q) of the given arrays.
+ *
+ * It returns the Riemann zeta function of two arguments \a x and \a q (also known as the Hurwitz zeta function):
+ *
+ * \param x is the exponent, it must be > 1
+ * \param q is the shift, it must be > 0
+ *
+ * \note This function supports only float and double scalar types. To support other scalar types, the user has
+ * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+ *
+ * \sa ArrayBase::zeta()
+ */
+template<typename DerivedX,typename DerivedQ>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
+ x.derived(),
+ q.derived()
+ );
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
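A hedged sketch of the ternary betainc() entry point added above (names and values are illustrative; float/double require a C++11 build):

    #include <unsupported/Eigen/SpecialFunctions>

    // regularized incomplete beta, coefficient-wise; returns a lazy
    // CwiseTernaryOp expression that is evaluated on assignment
    Eigen::ArrayXf a = Eigen::ArrayXf::Constant(3, 2.0f);
    Eigen::ArrayXf b = Eigen::ArrayXf::Constant(3, 3.0f);
    Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(3, 0.1f, 0.9f);
    Eigen::ArrayXf r = Eigen::betainc(a, b, x);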
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
new file mode 100644
index 000000000..d8f2363be
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/** \internal
+ * \brief Template functor to compute the incomplete gamma function igamma(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igamma
+ */
+template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar>
+{
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+ using numext::igamma; return igamma(a, x);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+ return internal::pigamma(a, x);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igamma_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasIGamma
+ };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igammac
+ */
+template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar>
+{
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+ using numext::igammac; return igammac(a, x);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
+ {
+ return internal::pigammac(a, x);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igammac_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasIGammac
+ };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the incomplete beta integral betainc(a, b, x)
+ *
+ */
+template<typename Scalar> struct scalar_betainc_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const {
+ using numext::betainc; return betainc(x, a, b);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const
+ {
+ return internal::pbetainc(x, a, b);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_betainc_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBetaInc
+ };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the natural log of the absolute
+ * value of Gamma of a scalar
+ * \sa class CwiseUnaryOp, Cwise::lgamma()
+ */
+template<typename Scalar> struct scalar_lgamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::lgamma; return lgamma(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_lgamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasLGamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
+ * \sa class CwiseUnaryOp, Cwise::digamma()
+ */
+template<typename Scalar> struct scalar_digamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::digamma; return digamma(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_digamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasDiGamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Riemann Zeta function of two arguments.
+ * \sa class CwiseUnaryOp, Cwise::zeta()
+ */
+template<typename Scalar> struct scalar_zeta_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
+ using numext::zeta; return zeta(x, q);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_zeta_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasZeta
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the polygamma function.
+ * \sa class CwiseUnaryOp, Cwise::polygamma()
+ */
+template<typename Scalar> struct scalar_polygamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
+ using numext::polygamma; return polygamma(n, x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_polygamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasPolygamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Gauss error function of a
+ * scalar
+ * \sa class CwiseUnaryOp, Cwise::erf()
+ */
+template<typename Scalar> struct scalar_erf_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::erf; return erf(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasErf
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Complementary Error Function
+ * of a scalar
+ * \sa class CwiseUnaryOp, Cwise::erfc()
+ */
+template<typename Scalar> struct scalar_erfc_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::erfc; return erfc(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erfc_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasErfc
+ };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
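All of the functors above follow one pattern: operator() for the scalar path, optionally packetOp() for the vectorized path, plus a functor_traits specialization advertising a cost estimate and packet support. A minimal sketch of the same pattern for a hypothetical unary functor (scalar path only, so PacketAccess stays false):

    namespace Eigen { namespace internal {

    template<typename Scalar> struct scalar_lgamma1p_op {  // hypothetical, for illustration
      EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma1p_op)
      EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
        using numext::lgamma; return lgamma(a + Scalar(1));  // log(Gamma(1+a))
      }
    };
    template<typename Scalar>
    struct functor_traits<scalar_lgamma1p_op<Scalar> > {
      enum {
        Cost = 10 * NumTraits<Scalar>::MulCost + 6 * NumTraits<Scalar>::AddCost,  // guesstimate
        PacketAccess = false
      };
    };

    }} // end namespaces internal, Eigen

    // usage: arr.unaryExpr(Eigen::internal::scalar_lgamma1p_op<double>())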
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
new file mode 100644
index 000000000..553bcda6a
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H
+#define EIGEN_SPECIALFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+ return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+} // end namespace numext
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_HALF_H
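What these specializations buy, in one line (illustrative; requires EIGEN_HAS_C99_MATH): special functions on Eigen::half are evaluated in float and cast back, e.g.

    Eigen::half h(4.0f);
    Eigen::half g = Eigen::numext::lgamma(h);  // == half(lgammaf(4.0f)) == half(log(6.0f))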
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
new file mode 100644
index 000000000..52619fc0c
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -0,0 +1,1551 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIAL_FUNCTIONS_H
+#define EIGEN_SPECIAL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+// Parts of this code are based on the Cephes Math Library.
+//
+// Cephes Math Library Release 2.8: June, 2000
+// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+// Permission has been kindly provided by the original author
+// to incorporate the Cephes software into the Eigen codebase:
+//
+// From: Stephen Moshier
+// To: Eugene Brevdo
+// Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+// Hello Eugene,
+//
+// Thank you for writing.
+//
+// If your licensing is similar to BSD, the formal way that has been
+// handled is simply to add a statement to the effect that you are incorporating
+// the Cephes software by permission of the author.
+//
+// Good luck with your project,
+// Steve
+
+namespace cephes {
+
+/* polevl (modified for Eigen)
+ *
+ * Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates a polynomial of degree N:
+ *
+ *   y = C_0 + C_1*x + C_2*x^2 + ... + C_N*x^N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ *   coef[0] = C_N, ..., coef[N] = C_0.
+ *
+ * The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array. Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized. For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Scalar, int N>
+struct polevl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
+ EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
+ }
+};
+
+template <typename Scalar>
+struct polevl<Scalar, 0> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
+ return coef[0];
+ }
+};
+
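+// Illustrative example (not part of the original Cephes code): evaluating
+// p(x) = 3x^2 + 2x + 1 at x = 2, with coefficients stored highest degree first:
+//
+//   const double coef[] = {3.0, 2.0, 1.0};          // C_2, C_1, C_0
+//   double y = polevl<double, 2>::run(2.0, coef);   // Horner: ((3*2)+2)*2+1 == 17
+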
+} // end namespace cephes
+
+/****************************************************************************
+ * Implementation of lgamma, requires C++11/C99 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct lgamma_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <typename Scalar>
+struct lgamma_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct lgamma_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); }
+};
+
+template <>
+struct lgamma_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); }
+};
+#endif
+
+/****************************************************************************
+ * Implementation of digamma (psi), based on Cephes *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct digamma_retval {
+ typedef Scalar type;
+};
+
+/*
+ *
+ * Polynomial evaluation helper for the Psi (digamma) function.
+ *
+ * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for
+ * input Scalar s, assuming s is above 10.0.
+ *
+ * If s is above a certain threshold for the given Scalar type, zero
+ * is returned. Otherwise the polynomial is evaluated with enough
+ * coefficients for results matching Scalar machine precision.
+ *
+ *
+ */
+template <typename Scalar>
+struct digamma_impl_maybe_poly {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+
+template <>
+struct digamma_impl_maybe_poly<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(const float s) {
+ const float A[] = {
+ -4.16666666666666666667E-3f,
+ 3.96825396825396825397E-3f,
+ -8.33333333333333333333E-3f,
+ 8.33333333333333333333E-2f
+ };
+
+ float z;
+ if (s < 1.0e8f) {
+ z = 1.0f / (s * s);
+ return z * cephes::polevl<float, 3>::run(z, A);
+ } else return 0.0f;
+ }
+};
+
+template <>
+struct digamma_impl_maybe_poly<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(const double s) {
+ const double A[] = {
+ 8.33333333333333333333E-2,
+ -2.10927960927960927961E-2,
+ 7.57575757575757575758E-3,
+ -4.16666666666666666667E-3,
+ 3.96825396825396825397E-3,
+ -8.33333333333333333333E-3,
+ 8.33333333333333333333E-2
+ };
+
+ double z;
+ if (s < 1.0e17) {
+ z = 1.0 / (s * s);
+ return z * cephes::polevl<double, 6>::run(z, A);
+ }
+ else return 0.0;
+ }
+};
+
+template <typename Scalar>
+struct digamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar x) {
+ /*
+ *
+ * Psi (digamma) function (modified for Eigen)
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, psi();
+ *
+ * y = psi( x );
+ *
+ *
+ * DESCRIPTION:
+ *
+ *   psi(x) = d/dx ln(Gamma(x))
+ *
+ * is the logarithmic derivative of the gamma function.
+ * For integer x,
+ *
+ *   psi(n) = -EUL + sum_{k=1..n-1} 1/k.
+ *
+ * If x is negative, it is transformed to a positive argument by the
+ * reflection formula psi(1-x) = psi(x) + pi cot(pi x).
+ * For general positive x, the argument is made greater than 10
+ * using the recurrence psi(x+1) = psi(x) + 1/x.
+ * Then the following asymptotic expansion is applied:
+ *
+ *   psi(x) = log(x) - 1/(2x) - sum_{k=1..inf} B_{2k} / (2k * x^{2k})
+ *
+ * where the B_{2k} are Bernoulli numbers.
+ *
+ * ACCURACY (double):
+ *    Relative error (except absolute when |psi| < 1):
+ *    arithmetic   domain     # trials      peak       rms
+ *       IEEE      0,30        30000       1.3e-15    1.4e-16
+ *       IEEE      -30,0       40000       1.5e-15    2.2e-16
+ *
+ * ACCURACY (float):
+ *    Absolute error, relative when |psi| > 1:
+ *    arithmetic   domain     # trials      peak       rms
+ *       IEEE      -33,0       30000       8.2e-7     1.2e-7
+ *       IEEE      0,33       100000       7.3e-7     7.7e-8
+ *
+ * ERROR MESSAGES:
+ * message condition value returned
+ * psi singularity x integer <=0 INFINITY
+ */
+
+ Scalar p, q, nz, s, w, y;
+ bool negative = false;
+
+ const Scalar maxnum = NumTraits<Scalar>::infinity();
+ const Scalar m_pi = Scalar(EIGEN_PI);
+
+ const Scalar zero = Scalar(0);
+ const Scalar one = Scalar(1);
+ const Scalar half = Scalar(0.5);
+ nz = zero;
+
+ if (x <= zero) {
+ negative = true;
+ q = x;
+ p = numext::floor(q);
+ if (p == q) {
+ return maxnum;
+ }
+ /* Remove the zeros of tan(m_pi x)
+ * by subtracting the nearest integer from x
+ */
+ nz = q - p;
+ if (nz != half) {
+ if (nz > half) {
+ p += one;
+ nz = q - p;
+ }
+ nz = m_pi / numext::tan(m_pi * nz);
+ }
+ else {
+ nz = zero;
+ }
+ x = one - x;
+ }
+
+ /* use the recurrence psi(x+1) = psi(x) + 1/x. */
+ s = x;
+ w = zero;
+ while (s < Scalar(10)) {
+ w += one / s;
+ s += one;
+ }
+
+ y = digamma_impl_maybe_poly<Scalar>::run(s);
+
+ y = numext::log(s) - (half / s) - y - w;
+
+ return (negative) ? y - nz : y;
+ }
+};
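+
+// Illustrative sanity checks (not part of the original code):
+//   digamma_impl<double>::run(1.0) == -EUL            ~= -0.5772156649
+//   digamma_impl<double>::run(0.5) == -EUL - 2*log(2) ~= -1.9635100260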
+
+/****************************************************************************
+ * Implementation of erf, requires C++11/C99 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct erf_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <typename Scalar>
+struct erf_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erf_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
+};
+
+template <>
+struct erf_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
+};
+#endif // EIGEN_HAS_C99_MATH
+
+/***************************************************************************
+* Implementation of erfc, requires C++11/C99 *
+****************************************************************************/
+
+template <typename Scalar>
+struct erfc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <typename Scalar>
+struct erfc_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erfc_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
+};
+
+template <>
+struct erfc_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
+};
+#endif // EIGEN_HAS_C99_MATH
+
+/**************************************************************************************************************
+ * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ **************************************************************************************************************/
+
+template <typename Scalar>
+struct igammac_retval {
+ typedef Scalar type;
+};
+
+// NOTE: cephes_helper is also used to implement zeta
+template <typename Scalar>
+struct cephes_helper {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; }
+};
+
+template <>
+struct cephes_helper<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float machep() {
+ return NumTraits<float>::epsilon() / 2; // 1.0 - machep == 1.0
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float big() {
+ // use epsneg (1.0 - epsneg == 1.0)
+ return 1.0f / (NumTraits<float>::epsilon() / 2);
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float biginv() {
+ // epsneg
+ return machep();
+ }
+};
+
+template <>
+struct cephes_helper<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double machep() {
+ return NumTraits<double>::epsilon() / 2; // 1.0 - machep == 1.0
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double big() {
+ return 1.0 / NumTraits<double>::epsilon();
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double biginv() {
+ // inverse of big, i.e. eps
+ return NumTraits<double>::epsilon();
+ }
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igammac_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar> struct igamma_impl; // predeclare igamma_impl
+
+template <typename Scalar>
+struct igammac_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ /* igamc()
+ *
+ * Incomplete gamma integral (modified for Eigen)
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double a, x, y, igamc();
+ *
+ * y = igamc( a, x );
+ *
+ * DESCRIPTION:
+ *
+ * The function is defined by
+ *
+ *
+ *   igamc(a,x) = 1 - igam(a,x)
+ *
+ *              = 1/Gamma(a) * integral from x to inf of e^(-t) * t^(a-1) dt.
+ *
+ *
+ * In this implementation both arguments must be positive.
+ * The integral is evaluated by either a power series or
+ * continued fraction expansion, depending on the relative
+ * values of a and x.
+ *
+ * ACCURACY (float):
+ *
+ *    Relative error:
+ *    arithmetic   domain     # trials      peak       rms
+ *       IEEE      0,30        30000       7.8e-6     5.9e-7
+ *
+ *
+ * ACCURACY (double):
+ *
+ *    Tested at random a, x.
+ *                   a          x         Relative error:
+ *    arithmetic   domain     domain      # trials      peak       rms
+ *       IEEE      0.5,100    0,100        200000      1.9e-14    1.7e-15
+ *       IEEE      0.01,0.5   0,100        200000      1.4e-13    1.6e-15
+ *
+ */
+ /*
+ Cephes Math Library Release 2.2: June, 1992
+ Copyright 1985, 1987, 1992 by Stephen L. Moshier
+ Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if ((x < zero) || (a <= zero)) {
+ // domain error
+ return nan;
+ }
+
+ if ((x < one) || (x < a)) {
+ /* The checks above ensure that we meet the preconditions for
+ * igamma_impl::Impl(), so call it, rather than igamma_impl::run().
+ * Calling run() would also work, but in that case the compiler may not be
+ * able to prove that igammac_impl::run and igamma_impl::run are not
+ * mutually recursive. This leads to worse code, particularly on
+ * platforms like nvptx, where recursion is allowed only begrudgingly.
+ */
+ return (one - igamma_impl<Scalar>::Impl(a, x));
+ }
+
+ return Impl(a, x);
+ }
+
+ private:
+ /* igamma_impl calls igammac_impl::Impl. */
+ friend struct igamma_impl<Scalar>;
+
+ /* Actually computes igamc(a, x).
+ *
+ * Preconditions:
+ * a > 0
+ * x >= 1
+ * x >= a
+ */
+ EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar two = 2;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+ const Scalar big = cephes_helper<Scalar>::big();
+ const Scalar biginv = cephes_helper<Scalar>::biginv();
+ const Scalar inf = NumTraits<Scalar>::infinity();
+
+ Scalar ans, ax, c, yc, r, t, y, z;
+ Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
+
+ if (x == inf) return zero; // std::isinf crashes on CUDA
+
+ /* Compute x**a * exp(-x) / gamma(a) */
+ ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+ if (ax < -maxlog) { // underflow
+ return zero;
+ }
+ ax = numext::exp(ax);
+
+ // continued fraction
+ y = one - a;
+ z = x + y + one;
+ c = zero;
+ pkm2 = one;
+ qkm2 = x;
+ pkm1 = x + one;
+ qkm1 = z * x;
+ ans = pkm1 / qkm1;
+
+ while (true) {
+ c += one;
+ y += one;
+ z += two;
+ yc = y * c;
+ pk = pkm1 * z - pkm2 * yc;
+ qk = qkm1 * z - qkm2 * yc;
+ if (qk != zero) {
+ r = pk / qk;
+ t = numext::abs((ans - r) / r);
+ ans = r;
+ } else {
+ t = one;
+ }
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+ if (numext::abs(pk) > big) {
+ pkm2 *= biginv;
+ pkm1 *= biginv;
+ qkm2 *= biginv;
+ qkm1 *= biginv;
+ }
+ if (t <= machep) {
+ break;
+ }
+ }
+
+ return (ans * ax);
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
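+
+// A minimal usage sketch (values approximate; assumes the unsupported
+// SpecialFunctions module header is on the include path). The scalar entry
+// point, defined in namespace numext at the end of this file, computes the
+// regularized upper incomplete gamma function Q(a, x):
+//
+//   #include <unsupported/Eigen/SpecialFunctions>
+//   #include <iostream>
+//
+//   int main() {
+//     // For integer a, Q(a, x) = e^-x * sum_{k<a} x^k / k!,
+//     // so Q(2, 3) = e^-3 * (1 + 3) ~= 0.19915.
+//     std::cout << Eigen::numext::igammac(2.0, 3.0) << "\n";
+//   }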
+
+/************************************************************************************************
+ * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct igamma_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igamma_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct igamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ /* igam()
+ * Incomplete gamma integral
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double a, x, y, igam();
+ *
+ * y = igam( a, x );
+ *
+ * DESCRIPTION:
+ *
+ * The function is defined by
+ *
+ * x
+ * -
+ * 1 | | -t a-1
+ * igam(a,x) = ----- | e t dt.
+ * - | |
+ * | (a) -
+ * 0
+ *
+ *
+ * In this implementation both arguments must be positive.
+ * The integral is evaluated by either a power series or
+ * continued fraction expansion, depending on the relative
+ * values of a and x.
+ *
+ * ACCURACY (double):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 200000 3.6e-14 2.9e-15
+ * IEEE 0,100 300000 9.9e-14 1.5e-14
+ *
+ *
+ * ACCURACY (float):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 20000 7.8e-6 5.9e-7
+ *
+ */
+ /*
+ Cephes Math Library Release 2.2: June, 1992
+ Copyright 1985, 1987, 1992 by Stephen L. Moshier
+ Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+
+
+ /* left tail of incomplete gamma function:
+ *
+ * inf. k
+ * a -x - x
+ * x e > ----------
+ * - -
+ * k=0 | (a+k+1)
+ *
+ */
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if (x == zero) return zero;
+
+ if ((x < zero) || (a <= zero)) { // domain error
+ return nan;
+ }
+
+ if ((x > one) && (x > a)) {
+ /* The checks above ensure that we meet the preconditions for
+ * igammac_impl::Impl(), so call it, rather than igammac_impl::run().
+ * Calling run() would also work, but in that case the compiler may not be
+ * able to prove that igammac_impl::run and igamma_impl::run are not
+ * mutually recursive. This leads to worse code, particularly on
+ * platforms like nvptx, where recursion is allowed only begrudgingly.
+ */
+ return (one - igammac_impl<Scalar>::Impl(a, x));
+ }
+
+ return Impl(a, x);
+ }
+
+ private:
+ /* igammac_impl calls igamma_impl::Impl. */
+ friend struct igammac_impl<Scalar>;
+
+ /* Actually computes igam(a, x).
+ *
+ * Preconditions:
+ * x > 0
+ * a > 0
+ * !(x > 1 && x > a)
+ */
+ EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+
+ Scalar ans, ax, c, r;
+
+ /* Compute x**a * exp(-x) / gamma(a) */
+ ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+ if (ax < -maxlog) {
+ // underflow
+ return zero;
+ }
+ ax = numext::exp(ax);
+
+ /* power series */
+ r = a;
+ c = one;
+ ans = one;
+
+ while (true) {
+ r += one;
+ c *= x/r;
+ ans += c;
+ if (c/ans <= machep) {
+ break;
+ }
+ }
+
+ return (ans * ax / a);
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
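+
+// A short sketch of the complementarity both implementations exploit
+// (values approximate): igamma computes the regularized lower incomplete
+// gamma function P(a, x), igammac the upper function Q(a, x), and
+// P(a, x) + Q(a, x) == 1 for a > 0, x >= 0.
+//
+//   double p = Eigen::numext::igamma(2.0, 3.0);   // ~= 0.80085
+//   double q = Eigen::numext::igammac(2.0, 3.0);  // ~= 0.19915
+//   // p + q ~= 1.0, up to rounding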
+
+/*****************************************************************************
+ * Implementation of Riemann zeta function of two arguments, based on Cephes *
+ *****************************************************************************/
+
+template <typename Scalar>
+struct zeta_retval {
+ typedef Scalar type;
+};
+
+template <typename Scalar>
+struct zeta_impl_series {
+ // Matches the signature of the float/double specializations below so that
+ // the static assert (rather than an overload-resolution failure) fires for
+ // unsupported scalar types.
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(Scalar&, Scalar&, Scalar&, const Scalar, const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return false;
+ }
+};
+
+template <>
+struct zeta_impl_series<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) {
+ int i = 0;
+ while(i < 9)
+ {
+ i += 1;
+ a += 1.0f;
+ b = numext::pow( a, -x );
+ s += b;
+ if( numext::abs(b/s) < machep )
+ return true;
+ }
+
+ // Not converged yet; the caller continues with the Euler-Maclaurin expansion.
+ return false;
+ }
+};
+
+template <>
+struct zeta_impl_series<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) {
+ int i = 0;
+ while( (i < 9) || (a <= 9.0) )
+ {
+ i += 1;
+ a += 1.0;
+ b = numext::pow( a, -x );
+ s += b;
+ if( numext::abs(b/s) < machep )
+ return true;
+ }
+
+ // Not converged yet; the caller continues with the Euler-Maclaurin expansion.
+ return false;
+ }
+};
+
+template <typename Scalar>
+struct zeta_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar x, Scalar q) {
+ /* zeta.c
+ *
+ * Riemann zeta function of two arguments
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, q, y, zeta();
+ *
+ * y = zeta( x, q );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ *
+ *
+ * inf.
+ * - -x
+ * zeta(x,q) = > (k+q)
+ * -
+ * k=0
+ *
+ * where x > 1 and q is not a negative integer or zero.
+ * The Euler-Maclaurin summation formula is used to obtain
+ * the expansion
+ *
+ * n
+ * - -x
+ * zeta(x,q) = > (k+q)
+ * -
+ * k=1
+ *
+ * 1-x inf. B x(x+1)...(x+2j)
+ * (n+q) 1 - 2j
+ * + --------- - ------- + > --------------------
+ * x-1 x - x+2j+1
+ * 2(n+q) j=1 (2j)! (n+q)
+ *
+ * where the B2j are Bernoulli numbers. Note that (see zetac.c)
+ * zeta(x,1) = zetac(x) + 1.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error for single precision:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,25 10000 6.9e-7 1.0e-7
+ *
+ * Large arguments may produce underflow in powf(), in which
+ * case the results are inaccurate.
+ *
+ * REFERENCE:
+ *
+ * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals,
+ * Series, and Products, p. 1073; Academic Press, 1980.
+ *
+ */
+
+ int i;
+ Scalar p, r, a, b, k, s, t, w;
+
+ const Scalar A[] = {
+ Scalar(12.0),
+ Scalar(-720.0),
+ Scalar(30240.0),
+ Scalar(-1209600.0),
+ Scalar(47900160.0),
+ Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/
+ Scalar(7.47242496e10),
+ Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/
+ Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/
+ Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/
+ Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/
+ Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/
+ };
+
+ const Scalar maxnum = NumTraits<Scalar>::infinity();
+ const Scalar zero = 0.0, half = 0.5, one = 1.0;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if( x == one )
+ return maxnum;
+
+ if( x < one )
+ {
+ return nan;
+ }
+
+ if( q <= zero )
+ {
+ if(q == numext::floor(q))
+ {
+ return maxnum;
+ }
+ p = x;
+ r = numext::floor(p);
+ if (p != r)
+ return nan;
+ }
+
+ /* Permit negative q but continue sum until n+q > +9.
+ * This case should be handled by a reflection formula.
+ * If q<0 and x is an integer, there is a relation to
+ * the polygamma function.
+ */
+ s = numext::pow( q, -x );
+ a = q;
+ b = zero;
+ // Run the summation in a helper function that is specific to the floating precision
+ if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) {
+ return s;
+ }
+
+ w = a;
+ s += b*w/(x-one);
+ s -= half * b;
+ a = one;
+ k = zero;
+ for( i=0; i<12; i++ )
+ {
+ a *= x + k;
+ b /= w;
+ t = a*b/A[i];
+ s = s + t;
+ t = numext::abs(t/s);
+ if( t < machep ) {
+ break;
+ }
+ k += one;
+ a *= x + k;
+ b /= w;
+ k += one;
+ }
+ return s;
+ }
+};
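+
+// A minimal usage sketch (values approximate): with q = 1 the Hurwitz zeta
+// function reduces to the Riemann zeta function, zeta(x, 1) == zeta(x).
+//
+//   double z = Eigen::numext::zeta(2.0, 1.0);  // pi^2 / 6 ~= 1.64493
+//   double h = Eigen::numext::zeta(2.0, 3.0);  // zeta(2) - 1 - 1/4 ~= 0.39493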
+
+/****************************************************************************
+ * Implementation of polygamma function, requires C++11/C99 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct polygamma_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct polygamma_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct polygamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar n, Scalar x) {
+ Scalar zero = 0.0, one = 1.0;
+ Scalar nplus = n + one;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ // Check that n is an integer
+ if (numext::floor(n) != n) {
+ return nan;
+ }
+ // For n = 0, this is just the digamma function
+ else if (n == zero) {
+ return digamma_impl<Scalar>::run(x);
+ }
+ // Otherwise use the same zeta-based identity as SciPy
+ else {
+ Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus));
+ return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x);
+ }
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
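+
+// The else-branch above encodes the identity also used by SciPy: for
+// integer n >= 1,
+//
+//   polygamma(n, x) = (-1)^(n+1) * n! * zeta(n+1, x).
+//
+// A quick check (values approximate):
+//
+//   double t = Eigen::numext::polygamma(1.0, 1.0);  // trigamma(1) = pi^2/6 ~= 1.64493
+//   double d = Eigen::numext::polygamma(0.0, 1.0);  // digamma(1) ~= -0.57722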
+
+/************************************************************************************************
+ * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct betainc_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct betainc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct betainc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) {
+ /* betaincf.c
+ *
+ * Incomplete beta integral
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float a, b, x, y, betaincf();
+ *
+ * y = betaincf( a, b, x );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns incomplete beta integral of the arguments, evaluated
+ * from zero to x. The function is defined as
+ *
+ * x
+ * - -
+ * | (a+b) | | a-1 b-1
+ * ----------- | t (1-t) dt.
+ * - - | |
+ * | (a) | (b) -
+ * 0
+ *
+ * The domain of definition is 0 <= x <= 1. In this
+ * implementation a and b are restricted to positive values.
+ * The integral from x to 1 may be obtained by the symmetry
+ * relation
+ *
+ * 1 - betainc( a, b, x ) = betainc( b, a, 1-x ).
+ *
+ * The integral is evaluated by a continued fraction expansion.
+ * If a < 1, the function calls itself recursively after a
+ * transformation to increase a to a+1.
+ *
+ * ACCURACY (float):
+ *
+ * Tested at random points (a,b,x) with a and b in the indicated
+ * interval and x between 0 and 1.
+ *
+ * arithmetic domain # trials peak rms
+ * Relative error:
+ * IEEE 0,30 10000 3.7e-5 5.1e-6
+ * IEEE 0,100 10000 1.7e-4 2.5e-5
+ * The useful domain for relative error is limited by underflow
+ * of the single precision exponential function.
+ * Absolute error:
+ * IEEE 0,30 100000 2.2e-5 9.6e-7
+ * IEEE 0,100 10000 6.5e-5 3.7e-6
+ *
+ * Larger errors may occur for extreme ratios of a and b.
+ *
+ * ACCURACY (double):
+ * arithmetic domain # trials peak rms
+ * IEEE 0,5 10000 6.9e-15 4.5e-16
+ * IEEE 0,85 250000 2.2e-13 1.7e-14
+ * IEEE 0,1000 30000 5.3e-12 6.3e-13
+ * IEEE 0,10000 250000 9.3e-11 7.1e-12
+ * IEEE 0,100000 10000 8.7e-10 4.8e-11
+ * Outputs smaller than the IEEE gradual underflow threshold
+ * were excluded from these statistics.
+ *
+ * ERROR MESSAGES:
+ * message condition value returned
+ * incbet domain x<0, x>1 nan
+ * incbet underflow nan
+ */
+
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True)
+ * Continued fraction expansion #2 for incomplete beta integral (small_branch = False)
+ */
+template <typename Scalar>
+struct incbeta_cfe {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value ||
+ internal::is_same<Scalar, double>::value),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ const Scalar big = cephes_helper<Scalar>::big();
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar two = 2;
+
+ Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
+ Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update;
+ Scalar ans;
+ int n;
+
+ const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300;
+ const Scalar thresh =
+ (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep;
+ Scalar r = (internal::is_same<Scalar, float>::value) ? zero : one;
+
+ if (small_branch) {
+ k1 = a;
+ k2 = a + b;
+ k3 = a;
+ k4 = a + one;
+ k5 = one;
+ k6 = b - one;
+ k7 = k4;
+ k8 = a + two;
+ k26update = one;
+ } else {
+ k1 = a;
+ k2 = b - one;
+ k3 = a;
+ k4 = a + one;
+ k5 = one;
+ k6 = a + b;
+ k7 = a + one;
+ k8 = a + two;
+ k26update = -one;
+ x = x / (one - x);
+ }
+
+ pkm2 = zero;
+ qkm2 = one;
+ pkm1 = one;
+ qkm1 = one;
+ ans = one;
+ n = 0;
+
+ do {
+ xk = -(x * k1 * k2) / (k3 * k4);
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ xk = (x * k5 * k6) / (k7 * k8);
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ if (qk != zero) {
+ r = pk / qk;
+ if (numext::abs(ans - r) < numext::abs(r) * thresh) {
+ return r;
+ }
+ ans = r;
+ }
+
+ k1 += one;
+ k2 += k26update;
+ k3 += two;
+ k4 += two;
+ k5 += one;
+ k6 -= k26update;
+ k7 += two;
+ k8 += two;
+
+ if ((numext::abs(qk) + numext::abs(pk)) > big) {
+ pkm2 *= biginv;
+ pkm1 *= biginv;
+ qkm2 *= biginv;
+ qkm1 *= biginv;
+ }
+ if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) {
+ pkm2 *= big;
+ pkm1 *= big;
+ qkm2 *= big;
+ qkm1 *= big;
+ }
+ } while (++n < num_iters);
+
+ return ans;
+ }
+};
+
+/* Helper functions depending on the Scalar type */
+template <typename Scalar>
+struct betainc_helper {};
+
+template <>
+struct betainc_helper<float> {
+ /* Core implementation, assumes a is large (> 1.0) */
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb,
+ float xx) {
+ float ans, a, b, t, x, onemx;
+ bool reversed_a_b = false;
+
+ onemx = 1.0f - xx;
+
+ /* see if x is greater than the mean */
+ if (xx > (aa / (aa + bb))) {
+ reversed_a_b = true;
+ a = bb;
+ b = aa;
+ t = xx;
+ x = onemx;
+ } else {
+ a = aa;
+ b = bb;
+ t = onemx;
+ x = xx;
+ }
+
+ /* Choose expansion for optimal convergence */
+ if (b > 10.0f) {
+ if (numext::abs(b * x / a) < 0.3f) {
+ t = betainc_helper<float>::incbps(a, b, x);
+ if (reversed_a_b) t = 1.0f - t;
+ return t;
+ }
+ }
+
+ ans = x * (a + b - 2.0f) / (a - 1.0f);
+ if (ans < 1.0f) {
+ ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */);
+ t = b * numext::log(t);
+ } else {
+ ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */);
+ t = (b - 1.0f) * numext::log(t);
+ }
+
+ t += a * numext::log(x) + lgamma_impl<float>::run(a + b) -
+ lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b);
+ t += numext::log(ans / a);
+ t = numext::exp(t);
+
+ if (reversed_a_b) t = 1.0f - t;
+ return t;
+ }
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) {
+ float t, u, y, s;
+ const float machep = cephes_helper<float>::machep();
+
+ y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a);
+ y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b);
+ y += lgamma_impl<float>::run(a + b);
+
+ t = x / (1.0f - x);
+ s = 0.0f;
+ u = 1.0f;
+ do {
+ b -= 1.0f;
+ if (b == 0.0f) {
+ break;
+ }
+ a += 1.0f;
+ u *= t * b / a;
+ s += u;
+ } while (numext::abs(u) > machep);
+
+ return numext::exp(y) * (1.0f + s);
+ }
+};
+
+template <>
+struct betainc_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static float run(float a, float b, float x) {
+ const float nan = NumTraits<float>::quiet_NaN();
+ float ans, t;
+
+ if (a <= 0.0f) return nan;
+ if (b <= 0.0f) return nan;
+ if ((x <= 0.0f) || (x >= 1.0f)) {
+ if (x == 0.0f) return 0.0f;
+ if (x == 1.0f) return 1.0f;
+ // mtherr("betaincf", DOMAIN);
+ return nan;
+ }
+
+ /* transformation for small a */
+ if (a <= 1.0f) {
+ ans = betainc_helper<float>::incbsa(a + 1.0f, b, x);
+ t = a * numext::log(x) + b * numext::log1p(-x) +
+ lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a + 1.0f) -
+ lgamma_impl<float>::run(b);
+ return (ans + numext::exp(t));
+ } else {
+ return betainc_helper<float>::incbsa(a, b, x);
+ }
+ }
+};
+
+template <>
+struct betainc_helper<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) {
+ const double machep = cephes_helper<double>::machep();
+
+ double s, t, u, v, n, t1, z, ai;
+
+ ai = 1.0 / a;
+ u = (1.0 - b) * x;
+ v = u / (a + 1.0);
+ t1 = v;
+ t = u;
+ n = 2.0;
+ s = 0.0;
+ z = machep * ai;
+ while (numext::abs(v) > z) {
+ u = (n - b) * x / n;
+ t *= u;
+ v = t / (a + n);
+ s += v;
+ n += 1.0;
+ }
+ s += t1;
+ s += ai;
+
+ u = a * numext::log(x);
+ // TODO: gamma() is not directly implemented in Eigen.
+ /*
+ if ((a + b) < maxgam && numext::abs(u) < maxlog) {
+ t = gamma(a + b) / (gamma(a) * gamma(b));
+ s = s * t * pow(x, a);
+ } else {
+ */
+ t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+ lgamma_impl<double>::run(b) + u + numext::log(s);
+ return numext::exp(t);
+ }
+};
+
+template <>
+struct betainc_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static double run(double aa, double bb, double xx) {
+ const double nan = NumTraits<double>::quiet_NaN();
+ const double machep = cephes_helper<double>::machep();
+ // const double maxgam = 171.624376956302725;
+
+ double a, b, t, x, xc, w, y;
+ bool reversed_a_b = false;
+
+ if (aa <= 0.0 || bb <= 0.0) {
+ return nan; // goto domerr;
+ }
+
+ if ((xx <= 0.0) || (xx >= 1.0)) {
+ if (xx == 0.0) return (0.0);
+ if (xx == 1.0) return (1.0);
+ // mtherr("incbet", DOMAIN);
+ return nan;
+ }
+
+ if ((bb * xx) <= 1.0 && xx <= 0.95) {
+ return betainc_helper<double>::incbps(aa, bb, xx);
+ }
+
+ w = 1.0 - xx;
+
+ /* Reverse a and b if x is greater than the mean. */
+ if (xx > (aa / (aa + bb))) {
+ reversed_a_b = true;
+ a = bb;
+ b = aa;
+ xc = xx;
+ x = w;
+ } else {
+ a = aa;
+ b = bb;
+ xc = w;
+ x = xx;
+ }
+
+ if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) {
+ t = betainc_helper<double>::incbps(a, b, x);
+ if (t <= machep) {
+ t = 1.0 - machep;
+ } else {
+ t = 1.0 - t;
+ }
+ return t;
+ }
+
+ /* Choose expansion for better convergence. */
+ y = x * (a + b - 2.0) - (a - 1.0);
+ if (y < 0.0) {
+ w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */);
+ } else {
+ w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc;
+ }
+
+ /* Multiply w by the factor
+ a b _ _ _
+ x (1-x) | (a+b) / ( a | (a) | (b) ) . */
+
+ y = a * numext::log(x);
+ t = b * numext::log(xc);
+ // TODO: gamma is not directly implemented in Eigen.
+ /*
+ if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog)
+ {
+ t = pow(xc, b);
+ t *= pow(x, a);
+ t /= a;
+ t *= w;
+ t *= gamma(a + b) / (gamma(a) * gamma(b));
+ } else {
+ */
+ /* Resort to logarithms. */
+ y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+ lgamma_impl<double>::run(b);
+ y += numext::log(w / a);
+ t = numext::exp(y);
+
+ /* } */
+ // done:
+
+ if (reversed_a_b) {
+ if (t <= machep) {
+ t = 1.0 - machep;
+ } else {
+ t = 1.0 - t;
+ }
+ }
+ return t;
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
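+
+// A minimal usage sketch (values approximate): betainc(a, b, x) is the
+// regularized incomplete beta integral I_x(a, b); the symmetry relation
+// quoted in the comments above, 1 - betainc(a, b, x) == betainc(b, a, 1-x),
+// can be checked directly.
+//
+//   double i1 = Eigen::numext::betainc(2.0, 2.0, 0.5);  // x^2 * (3 - 2x) = 0.5
+//   double i2 = Eigen::numext::betainc(2.0, 5.0, 0.3);  // ~= 0.57983
+//   double i3 = Eigen::numext::betainc(5.0, 2.0, 0.7);  // ~= 1 - i2 ~= 0.42017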
+
+} // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar)
+ lgamma(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar)
+ digamma(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar)
+zeta(const Scalar& x, const Scalar& q) {
+ return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar)
+polygamma(const Scalar& n, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar)
+ erf(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
+ erfc(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
+ igamma(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
+ igammac(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
+ betainc(const Scalar& a, const Scalar& b, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x);
+}
+
+} // end namespace numext
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
new file mode 100644
index 000000000..46d60d323
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
+
+/** \internal \returns the zeta function of two arguments (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
+
+/** \internal \returns the polygamma function (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
+
+/** \internal \returns the incomplete beta function betainc(\a a, \a b, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pbetainc(const Packet& a, const Packet& b, const Packet& x) { using numext::betainc; return betainc(a, b, x); }
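+
+// A minimal sketch of how these generic wrappers behave, assuming a scalar
+// "packet" type such as float: each forwards to the corresponding numext
+// scalar entry point, while architecture-specific packet types (e.g. the
+// CUDA float4/double2 specializations in arch/CUDA/CudaSpecialFunctions.h)
+// override them with lane-wise implementations.
+//
+//   float p = Eigen::internal::plgamma(3.0f);  // == numext::lgamma(3.0f)
+//                                              // lgamma(3) = log(2) ~= 0.69315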
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
new file mode 100644
index 000000000..ec4fa8448
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
+#define EIGEN_CUDA_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+ return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+ using numext::lgamma;
+ return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+ using numext::digamma;
+ return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+ using numext::digamma;
+ return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+ using numext::zeta;
+ return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+ using numext::zeta;
+ return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+ using numext::polygamma;
+ return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+ using numext::polygamma;
+ return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+ return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+ using numext::erf;
+ return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+ using numext::erfc;
+ return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+ using numext::erfc;
+ return make_double2(erfc(a.x), erfc(a.y));
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+ using numext::igamma;
+ return make_float4(
+ igamma(a.x, x.x),
+ igamma(a.y, x.y),
+ igamma(a.z, x.z),
+ igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+ using numext::igamma;
+ return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+ using numext::igammac;
+ return make_float4(
+ igammac(a.x, x.x),
+ igammac(a.y, x.y),
+ igammac(a.z, x.z),
+ igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+ using numext::igammac;
+ return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+ using numext::betainc;
+ return make_float4(
+ betainc(a.x, b.x, x.x),
+ betainc(a.y, b.y, x.y),
+ betainc(a.z, b.z, x.z),
+ betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+ using numext::betainc;
+ return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/Splines/CMakeLists.txt b/unsupported/Eigen/src/Splines/CMakeLists.txt
deleted file mode 100644
index 55c6271e9..000000000
--- a/unsupported/Eigen/src/Splines/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Splines_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Splines_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Splines COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h
index ddcddfc9a..627f6e482 100644
--- a/unsupported/Eigen/src/Splines/Spline.h
+++ b/unsupported/Eigen/src/Splines/Spline.h
@@ -94,7 +94,7 @@ namespace Eigen
const KnotVectorType& knots() const { return m_knots; }
/**
- * \brief Returns the knots of the underlying spline.
+ * \brief Returns the control points of the underlying spline.
**/
const ControlPointVectorType& ctrls() const { return m_ctrls; }
diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp
index 6b6fac075..afb0c94c2 100644
--- a/unsupported/doc/examples/BVH_Example.cpp
+++ b/unsupported/doc/examples/BVH_Example.cpp
@@ -6,9 +6,7 @@ using namespace Eigen;
typedef AlignedBox<double, 2> Box2d;
namespace Eigen {
- namespace internal {
- Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
- }
+ Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
}
struct PointPointMinimizer //how to compute squared distances between points and rectangles
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index fab140871..c0b321617 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -111,10 +111,14 @@ ei_add_test(gmres)
ei_add_test(minres)
ei_add_test(levenberg_marquardt)
ei_add_test(kronecker_product)
+ei_add_test(special_functions)
# TODO: The following test names are prefixed with the cxx11 string, since historically
# the tests depended on c++11. This isn't the case anymore so we ought to rename them.
-ei_add_test(cxx11_float16)
+# FIXME: Old versions of MSVC fail to compile this code, so we simply disable these tests
+# when using Visual Studio. We should make the version check stricter so that the tests
+# are enabled for newer versions of MSVC.
+if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
ei_add_test(cxx11_tensor_dimension)
ei_add_test(cxx11_tensor_map)
ei_add_test(cxx11_tensor_assign)
@@ -132,7 +136,8 @@ ei_add_test(cxx11_tensor_io)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
# This test requires __uint128_t which is only available on 64bit systems
ei_add_test(cxx11_tensor_uint128)
-endif()
+endif()
+endif()
if(EIGEN_TEST_CXX11)
# It should be safe to always run these tests as there is some fallback code for
@@ -188,10 +193,12 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
# in the CUDA runtime
+ # Also remove -ansi, which is incompatible with -std=c++11.
string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+ string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
@@ -207,7 +214,14 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
endif()
- set(CUDA_NVCC_FLAGS "-std=c++11 ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"")
+ if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
+ set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
+ else()
+ # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
+ set(EIGEN_CUDA_CXX11_FLAG "")
+ endif()
+
+ set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
@@ -217,6 +231,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(cxx11_tensor_reduction_cuda)
ei_add_test(cxx11_tensor_argmax_cuda)
ei_add_test(cxx11_tensor_cast_float16_cuda)
+ ei_add_test(cxx11_tensor_scan_cuda)
# The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index 1dd6dc97d..8b7528fb7 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -18,11 +18,11 @@ using namespace Eigen;
template < typename T>
-complex<long double> promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
+complex<long double> promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); }
-complex<long double> promote(float x) { return complex<long double>( x); }
-complex<long double> promote(double x) { return complex<long double>( x); }
-complex<long double> promote(long double x) { return complex<long double>( x); }
+complex<long double> promote(float x) { return complex<long double>((long double)x); }
+complex<long double> promote(double x) { return complex<long double>((long double)x); }
+complex<long double> promote(long double x) { return complex<long double>((long double)x); }
template <typename VT1,typename VT2>
@@ -33,7 +33,7 @@ complex<long double> promote(long double x) { return complex<long double>( x);
long double pi = acos((long double)-1 );
for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) {
complex<long double> acc = 0;
- long double phinc = -2.*k0* pi / timebuf.size();
+ long double phinc = (long double)(-2.)*k0* pi / timebuf.size();
for (size_t k1=0;k1<(size_t)timebuf.size();++k1) {
acc += promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
}
@@ -54,8 +54,8 @@ complex<long double> promote(long double x) { return complex<long double>( x);
long double difpower=0;
size_t n = (min)( buf1.size(),buf2.size() );
for (size_t k=0;k<n;++k) {
- totalpower += (numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2;
- difpower += numext::abs2(buf1[k] - buf2[k]);
+ totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2);
+ difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
}
return sqrt(difpower/totalpower);
}
@@ -93,19 +93,19 @@ void test_scalar_generic(int nfft)
fft.SetFlag(fft.HalfSpectrum );
fft.fwd( freqBuf,tbuf);
VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) );
- VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>() );// gross check
+ VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>() );// gross check
fft.ClearFlag(fft.HalfSpectrum );
fft.fwd( freqBuf,tbuf);
VERIFY( (size_t)freqBuf.size() == (size_t)nfft);
- VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>() );// gross check
+ VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>() );// gross check
if (nfft&1)
return; // odd FFTs get the wrong size inverse FFT
ScalarVector tbuf2;
fft.inv( tbuf2 , freqBuf);
- VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>() );// gross check
+ VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>() );// gross check
// verify that the Unscaled flag takes effect
@@ -121,12 +121,12 @@ void test_scalar_generic(int nfft)
//for (size_t i=0;i<(size_t) tbuf.size();++i)
// cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " - in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) << endl;
- VERIFY( dif_rmse(tbuf,tbuf3) < test_precision<T>() );// gross check
+ VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>() );// gross check
// verify that ClearFlag works
fft.ClearFlag(fft.Unscaled);
fft.inv( tbuf2 , freqBuf);
- VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>() );// gross check
+ VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>() );// gross check
}
template <typename T>
@@ -152,10 +152,10 @@ void test_complex_generic(int nfft)
inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
fft.fwd( outbuf , inbuf);
- VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>() );// gross check
+ VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>() );// gross check
fft.inv( buf3 , outbuf);
- VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>() );// gross check
+ VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>() );// gross check
// verify that the Unscaled flag takes effect
ComplexVector buf4;
@@ -163,12 +163,12 @@ void test_complex_generic(int nfft)
fft.inv( buf4 , outbuf);
for (int k=0;k<nfft;++k)
buf4[k] *= T(1./nfft);
- VERIFY( dif_rmse(inbuf,buf4) < test_precision<T>() );// gross check
+ VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>() );// gross check
// verify that ClearFlag works
fft.ClearFlag(fft.Unscaled);
fft.inv( buf3 , outbuf);
- VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>() );// gross check
+ VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>() );// gross check
}
template <typename T>
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index b59fd1c43..2da6dd8f3 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -205,6 +205,10 @@ void test_autodiff_hessian()
VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4));
VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3));
VERIFY_IS_APPROX(y.derivatives()(1).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4));
+
+ ADD z = x(0)*x(1);
+ VERIFY_IS_APPROX(z.derivatives()(0).derivatives(), Vector2d(0,1));
+ VERIFY_IS_APPROX(z.derivatives()(1).derivatives(), Vector2d(1,0));
}
double bug_1222() {
@@ -234,6 +238,32 @@ double bug_1223() {
return t.value() + t2.value();
}
+// regression test for some compilation issues with specializations of ScalarBinaryOpTraits
+void bug_1260() {
+ Matrix4d A;
+ Vector4d v;
+ A*v;
+}
+
+// check a compilation issue with numext::max
+double bug_1261() {
+ typedef AutoDiffScalar<Matrix2d> AD;
+ typedef Matrix<AD,2,1> VectorAD;
+
+ VectorAD v;
+ const AD maxVal = v.maxCoeff();
+ const AD minVal = v.minCoeff();
+ return maxVal.value() + minVal.value();
+}
+
+double bug_1264() {
+ typedef AutoDiffScalar<Vector2d> AD;
+ const AD s;
+ const Matrix<AD, 3, 1> v1;
+ const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
+ return v2(0).value();
+}
+
void test_autodiff()
{
for(int i = 0; i < g_repeat; i++) {
@@ -245,5 +275,7 @@ void test_autodiff()
bug_1222();
bug_1223();
+ bug_1260();
+ bug_1261();
}
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index c631c734a..4df2f5c57 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -36,13 +36,48 @@ template<typename Scalar> void check_atan2()
VERIFY_IS_APPROX(res.derivatives(), x.derivatives());
}
+template<typename Scalar> void check_hyperbolic_functions()
+{
+ using std::sinh;
+ using std::cosh;
+ using std::tanh;
+ typedef Matrix<Scalar, 1, 1> Deriv1;
+ typedef AutoDiffScalar<Deriv1> AD;
+ Deriv1 p = Deriv1::Random();
+ AD val(p.x(),Deriv1::UnitX());
+
+ Scalar cosh_px = std::cosh(p.x());
+ AD res1 = tanh(val);
+ VERIFY_IS_APPROX(res1.value(), std::tanh(p.x()));
+ VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(1.0) / (cosh_px * cosh_px));
+ AD res2 = sinh(val);
+ VERIFY_IS_APPROX(res2.value(), std::sinh(p.x()));
+ VERIFY_IS_APPROX(res2.derivatives().x(), cosh_px);
+ AD res3 = cosh(val);
+ VERIFY_IS_APPROX(res3.value(), cosh_px);
+ VERIFY_IS_APPROX(res3.derivatives().x(), std::sinh(p.x()));
+
+ // Check constant values.
+ const Scalar sample_point = Scalar(1) / Scalar(3);
+ val = AD(sample_point,Deriv1::UnitX());
+ res1 = tanh(val);
+ VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(0.896629559604914));
+
+ res2 = sinh(val);
+ VERIFY_IS_APPROX(res2.derivatives().x(), Scalar(1.056071867829939));
+
+ res3 = cosh(val);
+ VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150));
+}
void test_autodiff_scalar()
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( check_atan2<float>() );
CALL_SUBTEST_2( check_atan2<double>() );
+ CALL_SUBTEST_3( check_hyperbolic_functions<float>() );
+ CALL_SUBTEST_4( check_hyperbolic_functions<double>() );
}
}
diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp
deleted file mode 100644
index e39a7f83c..000000000
--- a/unsupported/test/cxx11_float16.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_float16
-
-#include "main.h"
-#include <Eigen/src/Core/arch/CUDA/Half.h>
-
-using Eigen::half;
-
-void test_conversion()
-{
- // Conversion from float.
- VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
- VERIFY_IS_EQUAL(half(0.5f).x, 0x3800);
- VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555);
- VERIFY_IS_EQUAL(half(0.0f).x, 0x0000);
- VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000);
- VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff);
- VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity.
-
- // Denormals.
- VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001);
- VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001);
- VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
-
- // Verify round-to-nearest-even behavior.
- float val1 = float(half(__half(0x3c00)));
- float val2 = float(half(__half(0x3c01)));
- float val3 = float(half(__half(0x3c02)));
- VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
- VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
-
- // Conversion from int.
- VERIFY_IS_EQUAL(half(-1).x, 0xbc00);
- VERIFY_IS_EQUAL(half(0).x, 0x0000);
- VERIFY_IS_EQUAL(half(1).x, 0x3c00);
- VERIFY_IS_EQUAL(half(2).x, 0x4000);
- VERIFY_IS_EQUAL(half(3).x, 0x4200);
-
- // Conversion from bool.
- VERIFY_IS_EQUAL(half(false).x, 0x0000);
- VERIFY_IS_EQUAL(half(true).x, 0x3c00);
-
- // Conversion to float.
- VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f);
- VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f);
-
- // Denormals.
- VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f);
- VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f);
- VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f);
-
- // NaNs and infinities.
- VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number.
- VERIFY(!(numext::isnan)(float(half(0.0f))));
- VERIFY((numext::isinf)(float(half(__half(0xfc00)))));
- VERIFY((numext::isnan)(float(half(__half(0xfc01)))));
- VERIFY((numext::isinf)(float(half(__half(0x7c00)))));
- VERIFY((numext::isnan)(float(half(__half(0x7c01)))));
-
-#if !EIGEN_COMP_MSVC
- // Visual Studio errors out on divisions by 0
- VERIFY((numext::isnan)(float(half(0.0 / 0.0))));
- VERIFY((numext::isinf)(float(half(1.0 / 0.0))));
- VERIFY((numext::isinf)(float(half(-1.0 / 0.0))));
-#endif
-
- // Exactly same checks as above, just directly on the half representation.
- VERIFY(!(numext::isinf)(half(__half(0x7bff))));
- VERIFY(!(numext::isnan)(half(__half(0x0000))));
- VERIFY((numext::isinf)(half(__half(0xfc00))));
- VERIFY((numext::isnan)(half(__half(0xfc01))));
- VERIFY((numext::isinf)(half(__half(0x7c00))));
- VERIFY((numext::isnan)(half(__half(0x7c01))));
-
-#if !EIGEN_COMP_MSVC
- // Visual Studio errors out on divisions by 0
- VERIFY((numext::isnan)(half(0.0 / 0.0)));
- VERIFY((numext::isinf)(half(1.0 / 0.0)));
- VERIFY((numext::isinf)(half(-1.0 / 0.0)));
-#endif
-}
-
-void test_numtraits()
-{
- std::cout << "expsilin = " << NumTraits<half>::epsilon() << std::endl;
- std::cout << "highest = " << NumTraits<half>::highest() << std::endl;
- std::cout << "lowest = " << NumTraits<half>::lowest() << std::endl;
- std::cout << "inifinty = " << NumTraits<half>::infinity() << std::endl;
- std::cout << "nan = " << NumTraits<half>::quiet_NaN() << std::endl;
-
-}
-
-void test_arithmetic()
-{
- VERIFY_IS_EQUAL(float(half(2) + half(2)), 4);
- VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0);
- VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f);
- VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f);
- VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f);
- VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f);
- VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f);
-}
-
-void test_comparison()
-{
- VERIFY(half(1.0f) > half(0.5f));
- VERIFY(half(0.5f) < half(1.0f));
- VERIFY(!(half(1.0f) < half(0.5f)));
- VERIFY(!(half(0.5f) > half(1.0f)));
-
- VERIFY(!(half(4.0f) > half(4.0f)));
- VERIFY(!(half(4.0f) < half(4.0f)));
-
- VERIFY(!(half(0.0f) < half(-0.0f)));
- VERIFY(!(half(-0.0f) < half(0.0f)));
- VERIFY(!(half(0.0f) > half(-0.0f)));
- VERIFY(!(half(-0.0f) > half(0.0f)));
-
- VERIFY(half(0.2f) > half(-1.0f));
- VERIFY(half(-1.0f) < half(0.2f));
- VERIFY(half(-16.0f) < half(-15.0f));
-
- VERIFY(half(1.0f) == half(1.0f));
- VERIFY(half(1.0f) != half(2.0f));
-
- // Comparisons with NaNs and infinities.
-#if !EIGEN_COMP_MSVC
- // Visual Studio errors out on divisions by 0
- VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0)));
- VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0));
-
- VERIFY(!(half(1.0) == half(0.0 / 0.0)));
- VERIFY(!(half(1.0) < half(0.0 / 0.0)));
- VERIFY(!(half(1.0) > half(0.0 / 0.0)));
- VERIFY(half(1.0) != half(0.0 / 0.0));
-
- VERIFY(half(1.0) < half(1.0 / 0.0));
- VERIFY(half(1.0) > half(-1.0 / 0.0));
-#endif
-}
-
-void test_basic_functions()
-{
- VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f);
- VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f);
-
- VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f);
- VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f);
-
- VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f);
- VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f);
-
- VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f);
- VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f);
-
- VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f);
- VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f);
-
- VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f);
- VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI));
-
- VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f);
- VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f);
-}
-
-void test_trigonometric_functions()
-{
- VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f)));
- VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI)));
- //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2)));
- //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2)));
- VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f)));
-
- VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f)));
- // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI)));
- VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2)));
- VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2)));
- VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f)));
-
- VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f)));
- // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI)));
- // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2)));
- //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2)));
- VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f)));
-}
-
-void test_cxx11_float16()
-{
- CALL_SUBTEST(test_conversion());
- CALL_SUBTEST(test_numtraits());
- CALL_SUBTEST(test_arithmetic());
- CALL_SUBTEST(test_comparison());
- CALL_SUBTEST(test_basic_functions());
- CALL_SUBTEST(test_trigonometric_functions());
-}
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 6569218c4..5f9bb938b 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -27,6 +27,8 @@ static void test_parallelism()
// Test we never-ever fail to match available tasks with idle threads.
const int kThreads = 16; // code below expects that this is a multiple of 4
NonBlockingThreadPool tp(kThreads);
+ VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
+ VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
for (int iter = 0; iter < 100; ++iter) {
std::atomic<int> running(0);
std::atomic<int> done(0);
@@ -34,6 +36,9 @@ static void test_parallelism()
// Schedule kThreads tasks and ensure that they all are running.
for (int i = 0; i < kThreads; ++i) {
tp.Schedule([&]() {
+ const int thread_id = tp.CurrentThreadId();
+ VERIFY_GE(thread_id, 0);
+ VERIFY_LE(thread_id, kThreads - 1);
running++;
while (phase < 1) {
}
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 4026f48f0..284b46803 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -1019,6 +1019,153 @@ void test_cuda_erfc(const Scalar stddev)
cudaFree(d_out);
}
+template <typename Scalar>
+void test_cuda_betainc()
+{
+ Tensor<Scalar, 1> in_x(125);
+ Tensor<Scalar, 1> in_a(125);
+ Tensor<Scalar, 1> in_b(125);
+ Tensor<Scalar, 1> out(125);
+ Tensor<Scalar, 1> expected_out(125);
+ out.setZero();
+
+ Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+ Array<Scalar, 1, Dynamic> x(125);
+ Array<Scalar, 1, Dynamic> a(125);
+ Array<Scalar, 1, Dynamic> b(125);
+ Array<Scalar, 1, Dynamic> v(125);
+
+ a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
+
+ b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+ 999.999, 999.999, 999.999;
+
+ x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+ 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+ 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+ 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+ -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+ 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+ 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
+
+ v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+ nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+ nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+ 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+ 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+ 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
+ nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+ 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+ 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+ 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+ 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+ 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
+ nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
+ nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
+ nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
+ nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
+ 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
+ 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
+ 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
+ 0.5000000000004947, 0.9999999999999999, nan;
+
+ for (int i = 0; i < 125; ++i) {
+ in_x(i) = x(i);
+ in_a(i) = a(i);
+ in_b(i) = b(i);
+ expected_out(i) = v(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in_x;
+ Scalar* d_in_a;
+ Scalar* d_in_b;
+ Scalar* d_out;
+ cudaMalloc((void**)(&d_in_x), bytes);
+ cudaMalloc((void**)(&d_in_a), bytes);
+ cudaMalloc((void**)(&d_in_b), bytes);
+ cudaMalloc((void**)(&d_out), bytes);
+
+ cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
+ cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
+ cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
+
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
+
+ gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
+
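+ // The betainc expression is evaluated asynchronously on the device stream, so
+ // copy the result back on the same stream and synchronize before checking.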
+ // Keep the API calls outside assert() so they are not compiled out when
+ // NDEBUG is defined.
+ cudaError_t copy_status = cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream());
+ assert(copy_status == cudaSuccess);
+ cudaError_t sync_status = cudaStreamSynchronize(gpu_device.stream());
+ assert(sync_status == cudaSuccess);
+
+ for (int i = 0; i < 125; ++i) {
+ if ((std::isnan)(expected_out(i))) {
+ VERIFY((std::isnan)(out(i)));
+ } else {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+ }
+
+ cudaFree(d_in_x);
+ cudaFree(d_in_a);
+ cudaFree(d_in_b);
+ cudaFree(d_out);
+}
+
+
void test_cxx11_tensor_cuda()
{
CALL_SUBTEST_1(test_cuda_elementwise_small());
@@ -1086,5 +1233,8 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST_5(test_cuda_igamma<double>());
CALL_SUBTEST_5(test_cuda_igammac<double>());
+
+ CALL_SUBTEST_6(test_cuda_betainc<float>());
+ CALL_SUBTEST_6(test_cuda_betainc<double>());
#endif
}
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 0bccc3396..16f168ed4 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -21,7 +21,7 @@ static void test_dynamic_size()
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
- VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7);
+ VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
VERIFY_IS_EQUAL((int)dimensions[0], 2);
VERIFY_IS_EQUAL((int)dimensions[1], 3);
VERIFY_IS_EQUAL((int)dimensions[2], 7);
@@ -34,12 +34,12 @@ static void test_fixed_size()
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
- VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7);
+ VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
}
static void test_match()
{
- Eigen::DSizes<int, 3> dyn(2,3,7);
+ Eigen::DSizes<unsigned int, 3> dyn((unsigned int)2,(unsigned int)3,(unsigned int)7);
Eigen::Sizes<2,3,7> stat;
VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true);
@@ -51,13 +51,13 @@ static void test_match()
static void test_rank_zero()
{
Eigen::Sizes<> scalar;
- VERIFY_IS_EQUAL(scalar.TotalSize(), 1);
- VERIFY_IS_EQUAL(scalar.rank(), 0);
- VERIFY_IS_EQUAL(internal::array_prod(scalar), 1);
+ VERIFY_IS_EQUAL((int)scalar.TotalSize(), 1);
+ VERIFY_IS_EQUAL((int)scalar.rank(), 0);
+ VERIFY_IS_EQUAL((int)internal::array_prod(scalar), 1);
Eigen::DSizes<ptrdiff_t, 0> dscalar;
- VERIFY_IS_EQUAL(dscalar.TotalSize(), 1);
- VERIFY_IS_EQUAL(dscalar.rank(), 0u);
+ VERIFY_IS_EQUAL((int)dscalar.TotalSize(), 1);
+ VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
}
void test_cxx11_tensor_dimension()
diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
index 8bbcf7089..489960529 100644
--- a/unsupported/test/cxx11_tensor_io.cpp
+++ b/unsupported/test/cxx11_tensor_io.cpp
@@ -14,6 +14,20 @@
template<int DataLayout>
+static void test_output_0d()
+{
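+ // A rank-0 tensor holds a single scalar; streaming it prints the bare value.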
+ Tensor<int, 0, DataLayout> tensor;
+ tensor() = 123;
+
+ std::stringstream os;
+ os << tensor;
+
+ std::string expected("123");
+ VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>
static void test_output_1d()
{
Tensor<int, 1, DataLayout> tensor(5);
@@ -26,6 +40,12 @@ static void test_output_1d()
std::string expected("0\n1\n2\n3\n4");
VERIFY_IS_EQUAL(std::string(os.str()), expected);
+
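+ // A zero-sized tensor must print as an empty string.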
+ Eigen::Tensor<double,1,DataLayout> empty_tensor(0);
+ std::stringstream empty_os;
+ empty_os << empty_tensor;
+ std::string empty_string;
+ VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string);
}
@@ -101,6 +121,8 @@ static void test_output_const()
void test_cxx11_tensor_io()
{
+ CALL_SUBTEST(test_output_0d<ColMajor>());
+ CALL_SUBTEST(test_output_0d<RowMajor>());
CALL_SUBTEST(test_output_1d<ColMajor>());
CALL_SUBTEST(test_output_1d<RowMajor>());
CALL_SUBTEST(test_output_2d<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index c575d3fdc..f7de43110 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -13,6 +13,7 @@
using Eigen::Tensor;
+template<typename>
static void test_simple_reshape()
{
Tensor<float, 5> tensor1(2,3,1,7,1);
@@ -40,7 +41,7 @@ static void test_simple_reshape()
}
}
-
+template<typename>
static void test_reshape_in_expr() {
MatrixXf m1(2,3*5*7*11);
MatrixXf m2(3*5*7*11,13);
@@ -65,7 +66,7 @@ static void test_reshape_in_expr() {
}
}
-
+template<typename>
static void test_reshape_as_lvalue()
{
Tensor<float, 3> tensor(2,3,7);
@@ -114,6 +115,7 @@ static void test_simple_slice()
}
}
+template<typename=void>
static void test_const_slice()
{
const float b[1] = {42};
@@ -459,25 +461,25 @@ static void test_composition()
void test_cxx11_tensor_morphing()
{
- CALL_SUBTEST(test_simple_reshape());
- CALL_SUBTEST(test_reshape_in_expr());
- CALL_SUBTEST(test_reshape_as_lvalue());
-
- CALL_SUBTEST(test_simple_slice<ColMajor>());
- CALL_SUBTEST(test_simple_slice<RowMajor>());
- CALL_SUBTEST(test_const_slice());
- CALL_SUBTEST(test_slice_in_expr<ColMajor>());
- CALL_SUBTEST(test_slice_in_expr<RowMajor>());
- CALL_SUBTEST(test_slice_as_lvalue<ColMajor>());
- CALL_SUBTEST(test_slice_as_lvalue<RowMajor>());
- CALL_SUBTEST(test_slice_raw_data<ColMajor>());
- CALL_SUBTEST(test_slice_raw_data<RowMajor>());
-
- CALL_SUBTEST(test_strided_slice_write<ColMajor>());
- CALL_SUBTEST(test_strided_slice<ColMajor>());
- CALL_SUBTEST(test_strided_slice_write<RowMajor>());
- CALL_SUBTEST(test_strided_slice<RowMajor>());
-
- CALL_SUBTEST(test_composition<ColMajor>());
- CALL_SUBTEST(test_composition<RowMajor>());
+ CALL_SUBTEST_1(test_simple_reshape<void>());
+ CALL_SUBTEST_1(test_reshape_in_expr<void>());
+ CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
+
+ CALL_SUBTEST_1(test_simple_slice<ColMajor>());
+ CALL_SUBTEST_1(test_simple_slice<RowMajor>());
+ CALL_SUBTEST_1(test_const_slice());
+ CALL_SUBTEST_2(test_slice_in_expr<ColMajor>());
+ CALL_SUBTEST_3(test_slice_in_expr<RowMajor>());
+ CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>());
+ CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>());
+ CALL_SUBTEST_5(test_slice_raw_data<ColMajor>());
+ CALL_SUBTEST_5(test_slice_raw_data<RowMajor>());
+
+ CALL_SUBTEST_6(test_strided_slice_write<ColMajor>());
+ CALL_SUBTEST_6(test_strided_slice<ColMajor>());
+ CALL_SUBTEST_6(test_strided_slice_write<RowMajor>());
+ CALL_SUBTEST_6(test_strided_slice<RowMajor>());
+
+ CALL_SUBTEST_7(test_composition<ColMajor>());
+ CALL_SUBTEST_7(test_composition<RowMajor>());
}
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 34e9f54a0..a6375d34a 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -13,14 +13,53 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-
+#include <cuda_fp16.h>
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
+template<typename>
+void test_cuda_numext() {
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+ bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
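+ // Apply the isnan predicate both in full precision and after casting to half;
+ // the resulting boolean masks are compared element by element below.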
+ gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+ Tensor<bool, 1> half_prec(num_elem);
+ Tensor<bool, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking numext " << i << std::endl;
+ VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+
#ifdef EIGEN_HAS_CUDA_FP16
+template<typename>
void test_cuda_conversion() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
@@ -55,7 +94,7 @@ void test_cuda_conversion() {
gpu_device.deallocate(d_conv);
}
-
+template<typename>
void test_cuda_unary() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
@@ -92,7 +131,7 @@ void test_cuda_unary() {
gpu_device.deallocate(d_res_float);
}
-
+template<typename>
void test_cuda_elementwise() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
@@ -134,6 +173,7 @@ void test_cuda_elementwise() {
gpu_device.deallocate(d_res_float);
}
+template<typename>
void test_cuda_trancendental() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
@@ -141,30 +181,39 @@ void test_cuda_trancendental() {
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
- d_float1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
- d_float2, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(
- d_res1_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(
- d_res1_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(
- d_res2_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(
- d_res2_float, num_elem);
+ Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+ gpu_float3.device(gpu_device) = gpu_float3.random();
gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
- gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().exp();
- gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>().log();
+ gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+
+ gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+ gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+ gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+ gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+ gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
Tensor<float, 1> input1(num_elem);
Tensor<Eigen::half, 1> half_prec1(num_elem);
@@ -172,12 +221,18 @@ void test_cuda_trancendental() {
Tensor<float, 1> input2(num_elem);
Tensor<Eigen::half, 1> half_prec2(num_elem);
Tensor<Eigen::half, 1> full_prec2(num_elem);
+ Tensor<float, 1> input3(num_elem);
+ Tensor<Eigen::half, 1> half_prec3(num_elem);
+ Tensor<Eigen::half, 1> full_prec3(num_elem);
gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
gpu_device.synchronize();
for (int i = 0; i < num_elem; ++i) {
@@ -186,17 +241,27 @@ void test_cuda_trancendental() {
}
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
- VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+ if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1
+ VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+ else
+ VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+ }
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
}
gpu_device.deallocate(d_float1);
gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_float3);
gpu_device.deallocate(d_res1_half);
gpu_device.deallocate(d_res1_float);
gpu_device.deallocate(d_res2_half);
gpu_device.deallocate(d_res2_float);
+ gpu_device.deallocate(d_res3_float);
+ gpu_device.deallocate(d_res3_half);
}
-
+template<typename>
void test_cuda_contractions() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
@@ -247,7 +312,7 @@ void test_cuda_contractions() {
gpu_device.deallocate(d_res_float);
}
-
+template<typename>
void test_cuda_reductions(int size1, int size2, int redux) {
std::cout << "Reducing " << size1 << " by " << size2
@@ -296,17 +361,19 @@ void test_cuda_reductions(int size1, int size2, int redux) {
gpu_device.deallocate(d_res_float);
}
+template<typename>
void test_cuda_reductions() {
- test_cuda_reductions(13, 13, 0);
- test_cuda_reductions(13, 13, 1);
+ test_cuda_reductions<void>(13, 13, 0);
+ test_cuda_reductions<void>(13, 13, 1);
- test_cuda_reductions(35, 36, 0);
- test_cuda_reductions(35, 36, 1);
+ test_cuda_reductions<void>(35, 36, 0);
+ test_cuda_reductions<void>(35, 36, 1);
- test_cuda_reductions(36, 35, 0);
- test_cuda_reductions(36, 35, 1);
+ test_cuda_reductions<void>(36, 35, 0);
+ test_cuda_reductions<void>(36, 35, 1);
}
+template<typename>
void test_cuda_full_reductions() {
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
@@ -355,7 +422,7 @@ void test_cuda_full_reductions() {
gpu_device.deallocate(d_res_float);
}
-
+template<typename>
void test_cuda_forced_evals() {
Eigen::CudaStreamDevice stream;
@@ -408,15 +475,17 @@ void test_cuda_forced_evals() {
void test_cxx11_tensor_of_float16_cuda()
{
+ CALL_SUBTEST_1(test_cuda_numext<void>());
+
#ifdef EIGEN_HAS_CUDA_FP16
- CALL_SUBTEST_1(test_cuda_conversion());
- CALL_SUBTEST_1(test_cuda_unary());
- CALL_SUBTEST_1(test_cuda_elementwise());
- CALL_SUBTEST_1(test_cuda_trancendental());
- CALL_SUBTEST_2(test_cuda_contractions());
- CALL_SUBTEST_3(test_cuda_reductions());
- CALL_SUBTEST_4(test_cuda_full_reductions());
- CALL_SUBTEST_5(test_cuda_forced_evals());
+ CALL_SUBTEST_1(test_cuda_conversion<void>());
+ CALL_SUBTEST_1(test_cuda_unary<void>());
+ CALL_SUBTEST_1(test_cuda_elementwise<void>());
+ CALL_SUBTEST_1(test_cuda_trancendental<void>());
+ CALL_SUBTEST_2(test_cuda_contractions<void>());
+ CALL_SUBTEST_3(test_cuda_reductions<void>());
+ CALL_SUBTEST_4(test_cuda_full_reductions<void>());
+ CALL_SUBTEST_5(test_cuda_forced_evals<void>());
#else
std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
#endif
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index ca483257b..1490ec3da 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -239,6 +239,33 @@ static void test_simple_reductions() {
}
}
+
+template <int DataLayout>
+static void test_reductions_in_expr() {
+ Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+ tensor.setRandom();
+ array<ptrdiff_t, 2> reduction_axis2;
+ reduction_axis2[0] = 1;
+ reduction_axis2[1] = 3;
+
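+ // Nest the reduction inside a larger expression so that it is evaluated as a
+ // sub-expression rather than as the root of the assignment.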
+ Tensor<float, 2, DataLayout> result(2, 5);
+ result = result.constant(1.0f) - tensor.sum(reduction_axis2);
+ VERIFY_IS_EQUAL(result.dimension(0), 2);
+ VERIFY_IS_EQUAL(result.dimension(1), 5);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ float sum = 0.0f;
+ for (int k = 0; k < 3; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ sum += tensor(i, k, j, l);
+ }
+ }
+ VERIFY_IS_APPROX(result(i, j), 1.0f - sum);
+ }
+ }
+}
+
+
template <int DataLayout>
static void test_full_reductions() {
Tensor<float, 2, DataLayout> tensor(2, 3);
@@ -462,6 +489,8 @@ void test_cxx11_tensor_reduction() {
CALL_SUBTEST(test_trivial_reductions<RowMajor>());
CALL_SUBTEST(test_simple_reductions<ColMajor>());
CALL_SUBTEST(test_simple_reductions<RowMajor>());
+ CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
+ CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
CALL_SUBTEST(test_full_reductions<ColMajor>());
CALL_SUBTEST(test_full_reductions<RowMajor>());
CALL_SUBTEST(test_user_defined_reductions<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index cad0c08e0..6d8f01c02 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -16,7 +16,7 @@
#include <unsupported/Eigen/CXX11/Tensor>
-template<int DataLayout>
+template<typename Type, int DataLayout>
static void test_full_reductions() {
Eigen::CudaStreamDevice stream;
@@ -25,24 +25,24 @@ static void test_full_reductions() {
const int num_rows = internal::random<int>(1024, 5*1024);
const int num_cols = internal::random<int>(1024, 5*1024);
- Tensor<float, 2, DataLayout> in(num_rows, num_cols);
+ Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
in.setRandom();
- Tensor<float, 0, DataLayout> full_redux;
+ Tensor<Type, 0, DataLayout> full_redux;
full_redux = in.sum();
- std::size_t in_bytes = in.size() * sizeof(float);
- std::size_t out_bytes = full_redux.size() * sizeof(float);
- float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
- float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
+ std::size_t in_bytes = in.size() * sizeof(Type);
+ std::size_t out_bytes = full_redux.size() * sizeof(Type);
+ Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+ Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
- TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
- TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr);
+ TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+ TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
out_gpu.device(gpu_device) = in_gpu.sum();
- Tensor<float, 0, DataLayout> full_redux_gpu;
+ Tensor<Type, 0, DataLayout> full_redux_gpu;
gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
gpu_device.synchronize();
@@ -54,6 +54,8 @@ static void test_full_reductions() {
}
void test_cxx11_tensor_reduction_cuda() {
- CALL_SUBTEST_1(test_full_reductions<ColMajor>());
- CALL_SUBTEST_2(test_full_reductions<RowMajor>());
+ CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
+ CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+ CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+ CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
}
diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp
index dbd3023d7..af59aa3ef 100644
--- a/unsupported/test/cxx11_tensor_scan.cpp
+++ b/unsupported/test/cxx11_tensor_scan.cpp
@@ -14,63 +14,73 @@
using Eigen::Tensor;
-template <int DataLayout, typename Type=float>
+template <int DataLayout, typename Type=float, bool Exclusive = false>
static void test_1d_scan()
{
- int size = 50;
- Tensor<Type, 1, DataLayout> tensor(size);
- tensor.setRandom();
- Tensor<Type, 1, DataLayout> result = tensor.cumsum(0);
+ int size = 50;
+ Tensor<Type, 1, DataLayout> tensor(size);
+ tensor.setRandom();
+ Tensor<Type, 1, DataLayout> result = tensor.cumsum(0, Exclusive);
- VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0));
+ VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0));
- float accum = 0;
- for (int i = 0; i < size; i++) {
+ float accum = 0;
+ for (int i = 0; i < size; i++) {
+ if (Exclusive) {
+ VERIFY_IS_EQUAL(result(i), accum);
+ accum += tensor(i);
+ } else {
accum += tensor(i);
VERIFY_IS_EQUAL(result(i), accum);
}
+ }
- accum = 1;
- result = tensor.cumprod(0);
- for (int i = 0; i < size; i++) {
+ accum = 1;
+ result = tensor.cumprod(0, Exclusive);
+ for (int i = 0; i < size; i++) {
+ if (Exclusive) {
+ VERIFY_IS_EQUAL(result(i), accum);
+ accum *= tensor(i);
+ } else {
accum *= tensor(i);
VERIFY_IS_EQUAL(result(i), accum);
}
+ }
}
template <int DataLayout, typename Type=float>
static void test_4d_scan()
{
- int size = 5;
- Tensor<Type, 4, DataLayout> tensor(size, size, size, size);
- tensor.setRandom();
+ int size = 5;
+ Tensor<Type, 4, DataLayout> tensor(size, size, size, size);
+ tensor.setRandom();
- Tensor<Type, 4, DataLayout> result(size, size, size, size);
+ Tensor<Type, 4, DataLayout> result(size, size, size, size);
- result = tensor.cumsum(0);
- float accum = 0;
- for (int i = 0; i < size; i++) {
- accum += tensor(i, 0, 0, 0);
- VERIFY_IS_EQUAL(result(i, 0, 0, 0), accum);
- }
- result = tensor.cumsum(1);
- accum = 0;
- for (int i = 0; i < size; i++) {
- accum += tensor(0, i, 0, 0);
- VERIFY_IS_EQUAL(result(0, i, 0, 0), accum);
- }
- result = tensor.cumsum(2);
- accum = 0;
- for (int i = 0; i < size; i++) {
- accum += tensor(0, 0, i, 0);
- VERIFY_IS_EQUAL(result(0, 0, i, 0), accum);
- }
- result = tensor.cumsum(3);
- accum = 0;
- for (int i = 0; i < size; i++) {
- accum += tensor(0, 0, 0, i);
- VERIFY_IS_EQUAL(result(0, 0, 0, i), accum);
- }
+ result = tensor.cumsum(0);
+ float accum = 0;
+ for (int i = 0; i < size; i++) {
+ accum += tensor(i, 1, 2, 3);
+ VERIFY_IS_EQUAL(result(i, 1, 2, 3), accum);
+ }
+ result = tensor.cumsum(1);
+ accum = 0;
+ for (int i = 0; i < size; i++) {
+ accum += tensor(1, i, 2, 3);
+ VERIFY_IS_EQUAL(result(1, i, 2, 3), accum);
+ }
+ result = tensor.cumsum(2);
+ accum = 0;
+ for (int i = 0; i < size; i++) {
+ accum += tensor(1, 2, i, 3);
+ VERIFY_IS_EQUAL(result(1, 2, i, 3), accum);
+ }
+ result = tensor.cumsum(3);
+ accum = 0;
+ for (int i = 0; i < size; i++) {
+ accum += tensor(1, 2, 3, i);
+ VERIFY_IS_EQUAL(result(1, 2, 3, i), accum);
+ }
}
template <int DataLayout>
@@ -89,8 +99,10 @@ static void test_tensor_maps() {
}
void test_cxx11_tensor_scan() {
- CALL_SUBTEST(test_1d_scan<ColMajor>());
- CALL_SUBTEST(test_1d_scan<RowMajor>());
+ CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));
+ CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
+ CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
+ CALL_SUBTEST((test_1d_scan<RowMajor, float, false>()));
CALL_SUBTEST(test_4d_scan<ColMajor>());
CALL_SUBTEST(test_4d_scan<RowMajor>());
CALL_SUBTEST(test_tensor_maps<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
new file mode 100644
index 000000000..35e19e51c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+void test_cuda_cumsum(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+ Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
+ Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
+ Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
+
+ t_input.setRandom();
+
+ std::size_t t_input_bytes = t_input.size() * sizeof(float);
+ std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+ float* d_t_input;
+ float* d_t_result;
+
+ cudaMalloc((void**)(&d_t_input), t_input_bytes);
+ cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+ cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
+
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+ gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
+ Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+ gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
+
+ gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
+ t_result = t_input.cumsum(1);
+
+ cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ for (size_t i = 0; i < t_result.size(); i++) {
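+ // Accept the value if either the absolute error is small or the two results
+ // are approximately equal in the relative sense.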
+ if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
+ }
+
+ cudaFree((void*)d_t_input);
+ cudaFree((void*)d_t_result);
+}
+
+
+void test_cxx11_tensor_scan_cuda()
+{
+ CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+}
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
index a03f75cfe..2f56eb495 100644
--- a/unsupported/test/cxx11_tensor_sugar.cpp
+++ b/unsupported/test/cxx11_tensor_sugar.cpp
@@ -33,7 +33,7 @@ static void test_comparison_sugar() {
}
-static void test_scalar_sugar() {
+static void test_scalar_sugar_add_mul() {
Tensor<float, 3> A(6, 7, 5);
Tensor<float, 3> B(6, 7, 5);
A.setRandom();
@@ -41,21 +41,41 @@ static void test_scalar_sugar() {
const float alpha = 0.43f;
const float beta = 0.21f;
+ const float gamma = 0.14f;
- Tensor<float, 3> R = A * A.constant(alpha) + B * B.constant(beta);
- Tensor<float, 3> S = A * alpha + B * beta;
-
- // TODO: add enough syntactic sugar to support this
- // Tensor<float, 3> T = alpha * A + beta * B;
+ Tensor<float, 3> R = A.constant(gamma) + A * A.constant(alpha) + B * B.constant(beta);
+ Tensor<float, 3> S = A * alpha + B * beta + gamma;
+ Tensor<float, 3> T = gamma + alpha * A + beta * B;
for (int i = 0; i < 6*7*5; ++i) {
VERIFY_IS_APPROX(R(i), S(i));
+ VERIFY_IS_APPROX(R(i), T(i));
}
}
+static void test_scalar_sugar_sub_div() {
+ Tensor<float, 3> A(6, 7, 5);
+ Tensor<float, 3> B(6, 7, 5);
+ A.setRandom();
+ B.setRandom();
+
+ const float alpha = 0.43f;
+ const float beta = 0.21f;
+ const float gamma = 0.14f;
+ const float delta = 0.32f;
+
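+ // R spells the broadcasts out with .constant(); S uses the scalar operator
+ // sugar. The loop below verifies that both forms produce the same result.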
+ Tensor<float, 3> R = A.constant(gamma) - A / A.constant(alpha)
+ - B.constant(beta) / B - A.constant(delta);
+ Tensor<float, 3> S = gamma - A / alpha - beta / B - delta;
+
+ for (int i = 0; i < 6*7*5; ++i) {
+ VERIFY_IS_APPROX(R(i), S(i));
+ }
+}
void test_cxx11_tensor_sugar()
{
CALL_SUBTEST(test_comparison_sugar());
- CALL_SUBTEST(test_scalar_sugar());
+ CALL_SUBTEST(test_scalar_sugar_add_mul());
+ CALL_SUBTEST(test_scalar_sugar_sub_div());
}
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index 02411a262..e770049e5 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,12 +9,12 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifdef EIGEN_TEST_PART_1
#include "sparse.h"
#include <Eigen/SparseExtra>
#include <Eigen/KroneckerProduct>
-
template<typename MatrixType>
void check_dimension(const MatrixType& ab, const int rows, const int cols)
{
@@ -230,3 +230,23 @@ void test_kronecker_product()
VERIFY_IS_APPROX(MatrixXf(sC2),dC);
}
}
+
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+
+// Simply check that, for a dense Kronecker product, the sparse module is not needed.
+
+#include "main.h"
+#include <Eigen/KroneckerProduct>
+
+void test_kronecker_product()
+{
+ MatrixXd a(2,2), b(3,3), c;
+ a.setRandom();
+ b.setRandom();
+ c = kroneckerProduct(a,b);
+ VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b);
+}
+
+#endif
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
index 1aa9e786a..ffa5691eb 100644
--- a/unsupported/test/mpreal_support.cpp
+++ b/unsupported/test/mpreal_support.cpp
@@ -17,6 +17,7 @@ void test_mpreal_support()
std::cerr << "dummy_precision = " << NumTraits<mpreal>::dummy_precision() << "\n";
std::cerr << "highest = " << NumTraits<mpreal>::highest() << "\n";
std::cerr << "lowest = " << NumTraits<mpreal>::lowest() << "\n";
+ std::cerr << "digits10 = " << NumTraits<mpreal>::digits10() << "\n";
for(int i = 0; i < g_repeat; i++) {
int s = Eigen::internal::random<int>(1,100);
diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
new file mode 100644
index 000000000..057fb3e92
--- /dev/null
+++ b/unsupported/test/special_functions.cpp
@@ -0,0 +1,345 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)
+{
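+ // Compare element-wise: finite values approximately, NaN against NaN, and
+ // other non-finite values (infinities) exactly.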
+ for(Index i=0; i<x.size(); ++i)
+ {
+ if((numext::isfinite)(y(i)))
+ VERIFY_IS_APPROX( x(i), y(i) );
+ else if((numext::isnan)(y(i)))
+ VERIFY((numext::isnan)(x(i)));
+ else
+ VERIFY_IS_EQUAL( x(i), y(i) );
+ }
+}
+
+template<typename ArrayType> void array_special_functions()
+{
+ using std::abs;
+ using std::sqrt;
+ typedef typename ArrayType::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ Scalar plusinf = std::numeric_limits<Scalar>::infinity();
+ Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+ Index rows = internal::random<Index>(1,30);
+ Index cols = 1;
+
+ // API
+ {
+ ArrayType m1 = ArrayType::Random(rows,cols);
+#if EIGEN_HAS_C99_MATH
+ VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1));
+ VERIFY_IS_APPROX(m1.digamma(), digamma(m1));
+ VERIFY_IS_APPROX(m1.erf(), erf(m1));
+ VERIFY_IS_APPROX(m1.erfc(), erfc(m1));
+#endif // EIGEN_HAS_C99_MATH
+ }
+
+
+#if EIGEN_HAS_C99_MATH
+ // check special functions (comparing against numpy implementation)
+ if (!NumTraits<Scalar>::IsComplex)
+ {
+
+ {
+ ArrayType m1 = ArrayType::Random(rows,cols);
+ ArrayType m2 = ArrayType::Random(rows,cols);
+
+ // Test various properties of igamma & igammac. These are normalized
+ // gamma integrals where
+ // igammac(a, x) = Gamma(a, x) / Gamma(a)
+ // igamma(a, x) = gamma(a, x) / Gamma(a)
+ // where Gamma and gamma are considered the standard unnormalized
+ // upper and lower incomplete gamma functions, respectively.
+ ArrayType a = m1.abs() + 2;
+ ArrayType x = m2.abs() + 2;
+ ArrayType zero = ArrayType::Zero(rows, cols);
+ ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
+ ArrayType a_m1 = a - one;
+ ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp();
+ ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp();
+ ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
+ ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
+
+ // Gamma(a, 0) == Gamma(a)
+ VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
+
+ // Gamma(a, x) + gamma(a, x) == Gamma(a)
+ VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
+
+ // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
+ VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
+
+ // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
+ VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
+ }
+
+ {
+ // Check exact values of igamma and igammac against a third party calculation.
+ Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+ Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+ // location i*6+j corresponds to a_s[i], x_s[j].
+ Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+ {0.0, 0.6321205588285578, 0.7768698398515702,
+ 0.9816843611112658, 9.999500016666262e-05, 1.0},
+ {0.0, 0.4275932955291202, 0.608374823728911,
+ 0.9539882943107686, 7.522076445089201e-07, 1.0},
+ {0.0, 0.01898815687615381, 0.06564245437845008,
+ 0.5665298796332909, 4.166333347221828e-18, 1.0},
+ {0.0, 0.9999780593618628, 0.9999899967080838,
+ 0.9999996219837988, 0.9991370418689945, 1.0},
+ {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+ Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+ {1.0, 0.36787944117144233, 0.22313016014842982,
+ 0.018315638888734182, 0.9999000049998333, 0.0},
+ {1.0, 0.5724067044708798, 0.3916251762710878,
+ 0.04601170568923136, 0.9999992477923555, 0.0},
+ {1.0, 0.9810118431238462, 0.9343575456215499,
+ 0.4334701203667089, 1.0, 0.0},
+ {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+ 3.7801620118431334e-07, 0.0008629581310054535,
+ 0.0},
+ {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ if ((std::isnan)(igamma_s[i][j])) {
+ VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j])));
+ } else {
+ VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
+ }
+
+ if ((std::isnan)(igammac_s[i][j])) {
+ VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j])));
+ } else {
+ VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
+ }
+ }
+ }
+ }
+ }
+#endif // EIGEN_HAS_C99_MATH
+
+ // Check the zeta function against scipy.special.zeta
+ {
+ ArrayType x(7), q(7), res(7), ref(7);
+ x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9;
+ q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345;
+ ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan;
+ CALL_SUBTEST( verify_component_wise(ref, ref); );
+ CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); );
+ CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );
+ }
+
+ // digamma
+ {
+ ArrayType x(7), res(7), ref(7);
+ x << 1, 1.5, 4, -10.5, 10000.5, 0, -1;
+ ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf;
+ CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+ CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); );
+ CALL_SUBTEST( res = digamma(x); verify_component_wise(res, ref); );
+ }
+
+
+#if EIGEN_HAS_C99_MATH
+ {
+ ArrayType n(11), x(11), res(11), ref(11);
+ n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170;
+ x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64;
+ ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927;
+ CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+ if(sizeof(RealScalar)>=8) { // double
+ // Reason for commented line: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+ // CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res, ref); );
+ CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res, ref); );
+ }
+ else {
+ // CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res.head(8), ref.head(8)); );
+ CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res.head(8), ref.head(8)); );
+ }
+ }
+#endif
+
+#if EIGEN_HAS_C99_MATH
+ {
+ // Inputs and ground truth generated with scipy via:
+ // a = np.logspace(-3, 3, 5) - 1e-3
+ // b = np.logspace(-3, 3, 5) - 1e-3
+ // x = np.linspace(-0.1, 1.1, 5)
+ // (full_a, full_b, full_x) = np.vectorize(lambda a, b, x: (a, b, x))(*np.ix_(a, b, x))
+ // full_a = full_a.flatten().tolist() # same for full_b, full_x
+ // v = scipy.special.betainc(full_a, full_b, full_x).flatten().tolist()
+ //
+ // Note that Eigen's betainc takes its arguments in the order (a, b, x),
+ // matching scipy.special.betainc.
+ ArrayType a(125);
+ ArrayType b(125);
+ ArrayType x(125);
+ ArrayType v(125);
+ ArrayType res(125);
+
+ a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 999.999;
+
+ b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999,
+ 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+ 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+ 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+ 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 31.62177660168379, 31.62177660168379, 31.62177660168379,
+ 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+ 999.999, 999.999;
+
+ x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+ 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+ 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+ -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+ 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+ 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+ 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+ 0.8, 1.1;
+
+ v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+ nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+ nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+ 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+ 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+ 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan,
+ nan, nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+ 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+ 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+ 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+ 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+ 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06,
+ nan, nan, 7.864342668429763e-23, 3.015969667594166e-10,
+ 0.0008598571564165444, nan, nan, 6.031987710123844e-08,
+ 0.5000000000000007, 0.9999999396801229, nan, nan, 0.9999999999999999,
+ 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan,
+ nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan,
+ 0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan, nan, 0.0,
+ 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
+ 2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
+
+ CALL_SUBTEST(res = betainc(a, b, x);
+ verify_component_wise(res, v););
+ }
+
+ // Test various properties of betainc
+ {
+ ArrayType m1 = ArrayType::Random(32);
+ ArrayType m2 = ArrayType::Random(32);
+ ArrayType m3 = ArrayType::Random(32);
+ ArrayType one = ArrayType::Constant(32, Scalar(1.0));
+ const Scalar eps = std::numeric_limits<Scalar>::epsilon();
+ ArrayType a = (m1 * 4.0).exp();
+ ArrayType b = (m2 * 4.0).exp();
+ ArrayType x = m3.abs();
+
+ // betainc(a, 1, x) == x**a
+ CALL_SUBTEST(
+ ArrayType test = betainc(a, one, x);
+ ArrayType expected = x.pow(a);
+ verify_component_wise(test, expected););
+
+ // betainc(1, b, x) == 1 - (1 - x)**b
+ CALL_SUBTEST(
+ ArrayType test = betainc(one, b, x);
+ ArrayType expected = one - (one - x).pow(b);
+ verify_component_wise(test, expected););
+
+ // betainc(a, b, x) == 1 - betainc(b, a, 1-x)
+ CALL_SUBTEST(
+ ArrayType test = betainc(a, b, x) + betainc(b, a, one - x);
+ ArrayType expected = one;
+ verify_component_wise(test, expected););
+
+ // betainc(a+1, b, x) = betainc(a, b, x) - x**a * (1 - x)**b / (a * beta(a, b))
+ CALL_SUBTEST(
+ ArrayType num = x.pow(a) * (one - x).pow(b);
+ ArrayType denom = a * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+ // Add eps to rhs and lhs so that component-wise test doesn't result in
+ // nans when both outputs are zeros.
+ ArrayType expected = betainc(a, b, x) - num / denom + eps;
+ ArrayType test = betainc(a + one, b, x) + eps;
+ if (sizeof(Scalar) >= 8) { // double
+ verify_component_wise(test, expected);
+ } else {
+ // Reason for limited test: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+ verify_component_wise(test.head(8), expected.head(8));
+ });
+
+ // betainc(a, b+1, x) = betainc(a, b, x) + x**a * (1 - x)**b / (b * beta(a, b))
+ CALL_SUBTEST(
+ // Add eps to rhs and lhs so that component-wise test doesn't result in
+ // nans when both outputs are zeros.
+ ArrayType num = x.pow(a) * (one - x).pow(b);
+ ArrayType denom = b * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+ ArrayType expected = betainc(a, b, x) + num / denom + eps;
+ ArrayType test = betainc(a, b + one, x) + eps;
+ verify_component_wise(test, expected););
+ }
+#endif
+}
+
+void test_special_functions()
+{
+ CALL_SUBTEST_1(array_special_functions<ArrayXf>());
+ CALL_SUBTEST_2(array_special_functions<ArrayXd>());
+}