Diffstat (limited to 'unsupported/Eigen')
-rw-r--r--  unsupported/Eigen/CMakeLists.txt | 7
-rw-r--r--  unsupported/Eigen/CXX11/CMakeLists.txt | 2
-rw-r--r--  unsupported/Eigen/CXX11/Tensor | 14
-rw-r--r--  unsupported/Eigen/CXX11/src/CMakeLists.txt | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/README.md | 42
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 20
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 152
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 41
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 74
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 670
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 40
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 48
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 157
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 120
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 30
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 108
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 59
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 82
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 588
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h | 33
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 69
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 202
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 50
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 51
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 284
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 112
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 276
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 104
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 495
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 287
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 23
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h | 9
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 206
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 37
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h | 7
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/util/MaxSizeVector.h | 11
-rw-r--r--  unsupported/Eigen/EulerAngles | 43
-rw-r--r--  unsupported/Eigen/KroneckerProduct | 2
-rw-r--r--  unsupported/Eigen/MPRealSupport | 29
-rw-r--r--  unsupported/Eigen/SpecialFunctions | 61
-rw-r--r--  unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h | 49
-rwxr-xr-x  unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 199
-rw-r--r--  unsupported/Eigen/src/AutoDiff/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/BVH/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/CMakeLists.txt | 15
-rw-r--r--  unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h | 14
-rw-r--r--  unsupported/Eigen/src/Eigenvalues/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/EulerAngles/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/EulerAngles/EulerAngles.h | 386
-rw-r--r--  unsupported/Eigen/src/EulerAngles/EulerSystem.h | 326
-rw-r--r--  unsupported/Eigen/src/FFT/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/IterativeSolvers/GMRES.h | 5
-rw-r--r--  unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h | 6
-rw-r--r--  unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h | 2
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h | 20
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h | 9
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h | 12
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixPower.h | 16
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h | 33
-rw-r--r--  unsupported/Eigen/src/MoreVectorization/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h | 4
-rw-r--r--  unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h | 6
-rw-r--r--  unsupported/Eigen/src/NumericalDiff/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/Polynomials/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/Skyline/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/SparseExtra/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h | 124
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h | 236
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h | 47
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 1551
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h | 58
-rw-r--r--  unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h | 165
-rw-r--r--  unsupported/Eigen/src/Splines/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/src/Splines/Spline.h | 6
112 files changed, 6766 insertions, 1440 deletions
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index 6d0cf4f9d..631a06014 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -4,6 +4,7 @@ set(Eigen_HEADERS
ArpackSupport
AutoDiff
BVH
+ EulerAngles
FFT
IterativeSolvers
KroneckerProduct
@@ -17,6 +18,7 @@ set(Eigen_HEADERS
Polynomials
Skyline
SparseExtra
+ SpecialFunctions
Splines
)
@@ -25,5 +27,6 @@ install(FILES
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
)
-add_subdirectory(src)
-add_subdirectory(CXX11)
\ No newline at end of file
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
+add_subdirectory(CXX11)
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index a40bc4715..385ed240c 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -5,4 +5,4 @@ install(FILES
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
)
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 1e97ad3c0..4976a1254 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -15,6 +15,7 @@
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../SpecialFunctions"
#include "src/util/CXX11Meta.h"
#include "src/util/MaxSizeVector.h"
@@ -60,15 +61,17 @@ typedef unsigned __int64 uint64_t;
#ifdef EIGEN_USE_GPU
#include <iostream>
#include <cuda_runtime.h>
-#if defined(__CUDACC__)
-#include <curand_kernel.h>
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <unistd.h>
#endif
#endif
-
#include "src/Tensor/TensorMacros.h"
#include "src/Tensor/TensorForwardDeclarations.h"
#include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h"
#include "src/Tensor/TensorDeviceCuda.h"
@@ -77,13 +80,13 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorDimensions.h"
#include "src/Tensor/TensorInitializer.h"
#include "src/Tensor/TensorTraits.h"
-#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorRandom.h"
#include "src/Tensor/TensorUInt128.h"
#include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
#include "src/Tensor/TensorBase.h"
-#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
@@ -115,6 +118,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorForcedEval.h"
#include "src/Tensor/TensorGenerator.h"
#include "src/Tensor/TensorAssign.h"
+#include "src/Tensor/TensorScan.h"
#include "src/Tensor/TensorExecutor.h"
#include "src/Tensor/TensorDevice.h"
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
deleted file mode 100644
index 1734262bb..000000000
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(util)
-add_subdirectory(ThreadPool)
-add_subdirectory(Tensor)
-add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_Tensor_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index eeca2f69e..02146527b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1102,7 +1102,7 @@ Example: Reduction along two dimensions.
As a special case, if you pass no parameter to a reduction operation the
original tensor is reduced along *all* its dimensions. The result is a
-one-dimension tensor with a single value.
+scalar, represented as a zero-dimension tensor.
Eigen::Tensor<float, 3> a(2, 3, 4);
a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
@@ -1112,7 +1112,7 @@ one-dimension tensor with a single value.
{19.0f, 18.0f, 17.0f, 16.0f},
{20.0f, 21.0f, 22.0f, 23.0f}}});
// Reduce along all dimensions using the sum() operator.
- Eigen::Tensor<float, 1> b = a.sum();
+ Eigen::Tensor<float, 0> b = a.sum();
cout << "b" << endl << b << endl << endl;
=>
b
@@ -1168,6 +1168,44 @@ Reduce a tensor using a user-defined reduction operator. See ```SumReducer```
in TensorFunctors.h for information on how to implement a reduction operator.
+## Scan Operations
+
+A *Scan* operation returns a tensor with the same dimensions as the original
+tensor. The operation performs an inclusive scan along the specified
+axis, which means it computes a running total along the axis for a given
+reduction operation.
+If the reduction operation corresponds to summation, then this computes the
+prefix sum of the tensor along the given axis.
+
+Example:
+
+
+ // Create a tensor of 2 dimensions
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{1, 2, 3}, {4, 5, 6}});
+ // Scan it along the second dimension (1) using summation
+ Eigen::Tensor<int, 2> b = a.cumsum(1);
+ // The result is a tensor with the same size as the input
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 2 3
+ 4 5 6
+
+ b
+ 1 3 6
+ 4 9 15
+
+### <Operation> cumsum(const Index& axis)
+
+Perform a scan by summing consecutive entries.
+
+### <Operation> cumprod(const Index& axis)
+
+Perform a scan by multiplying consecutive entries.
+
+
## Convolutions
### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
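The new README section documents `cumsum` and `cumprod`; the TensorBase.h hunk further below also adds an optional `exclusive` flag to both. A short illustrative sketch (not from the patch) of how the two would be used together, assuming the API exactly as declared in that hunk:

    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<int, 2> a(2, 3);
      a.setValues({{1, 2, 3}, {4, 5, 6}});

      // Running product along dimension 1: {{1, 2, 6}, {4, 20, 120}}.
      Eigen::Tensor<int, 2> p = a.cumprod(1);

      // Exclusive scan: each output omits the current element,
      // so the first row becomes {0, 1, 3}.
      Eigen::Tensor<int, 2> s = a.cumsum(1, /*exclusive=*/true);

      std::cout << "cumprod:\n" << p << "\n\nexclusive cumsum:\n" << s << std::endl;
      return 0;
    }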
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 759dede3f..1940a9692 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -110,7 +110,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
@@ -150,7 +150,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
return m_storage.data()[index];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
@@ -190,7 +190,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
return m_storage.data()[index];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
@@ -257,7 +257,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
return coeff(index);
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
@@ -336,7 +336,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
: m_storage(firstDimension, otherDimensions...)
@@ -350,22 +350,22 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
: m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
{
EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
: m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
{
EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
: m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
{
EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
: m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
{
EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -418,7 +418,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
return *this;
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
void resize(Index firstDimension, IndexTypes... otherDimensions)
{
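The `#ifdef` guards become `#if` because `EIGEN_HAS_VARIADIC_TEMPLATES` is now always defined (to 0 or 1), and the multi-index constructors lose their `explicit` marker. A minimal sketch of the code paths these guards protect, assuming C++11:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Multi-index constructor (no longer declared 'explicit' for 2+ arguments).
      Eigen::Tensor<float, 3> t(4, 3, 2);
      t.setZero();

      // Variadic resize, compiled only when EIGEN_HAS_VARIADIC_TEMPLATES is 1.
      t.resize(2, 2, 2);

      // Variadic coefficient access from the same guarded blocks.
      t(0, 1, 1) = 1.5f;
      return 0;
    }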
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index babafe108..d06f40cd8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -254,6 +254,14 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost = 1.0 +
+ (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+ return m_orig_impl.costPerCoeff(vectorized) +
+ m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+ }
+
private:
EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
if (m_return_dim < 0) {
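The added `costPerCoeff` lets the cost model account for the index arithmetic performed by the tuple reducer behind `argmax`/`argmin`. A usage sketch of the operation being costed, with arbitrarily chosen dimensions:

    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 3> t(2, 3, 4);
      t.setRandom();

      // Index of the maximum along dimension 1; the result drops that dimension.
      Eigen::Tensor<Eigen::DenseIndex, 2> am = t.argmax(1);

      std::cout << am << std::endl;
      return 0;
    }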
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 1a34f3ccc..7a45a5cf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+ log1p() const {
+ return unaryExpr(internal::scalar_log1p_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
abs() const {
return unaryExpr(internal::scalar_abs_op<Scalar>());
@@ -204,34 +210,74 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
pow(Scalar exponent) const {
- return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+ real() const {
+ return unaryExpr(internal::scalar_real_op<Scalar>());
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+ imag() const {
+ return unaryExpr(internal::scalar_imag_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
operator+ (Scalar rhs) const {
- return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+ operator+ (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
operator- (Scalar rhs) const {
EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+ operator- (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
operator* (Scalar rhs) const {
- return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+ operator* (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
operator/ (Scalar rhs) const {
- return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+ operator/ (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
}
EIGEN_DEVICE_FUNC
@@ -277,7 +323,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
return unaryExpr(internal::scalar_floor_op<Scalar>());
}
-
// Generic binary operation support.
template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
@@ -342,66 +387,66 @@ class TensorBase<Derived, ReadOnlyAccessors>
// Comparisons and tests.
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
operator<(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
operator<=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
operator>(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
operator>=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
operator==(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
operator!=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
}
// comparisons and tests for Scalars
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator<(Scalar threshold) const {
return operator<(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator<=(Scalar threshold) const {
return operator<=(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator>(Scalar threshold) const {
return operator>(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator>=(Scalar threshold) const {
return operator>=(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator==(Scalar threshold) const {
return operator==(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator!=(Scalar threshold) const {
return operator!=(constant(threshold));
}
@@ -453,6 +498,28 @@ class TensorBase<Derived, ReadOnlyAccessors>
return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft);
}
+ // Scan.
+ typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanSumOp
+ cumsum(const Index& axis, bool exclusive = false) const {
+ return TensorScanSumOp(derived(), axis, exclusive);
+ }
+
+ typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanProdOp
+ cumprod(const Index& axis, bool exclusive = false) const {
+ return TensorScanProdOp(derived(), axis, exclusive);
+ }
+
+ template <typename Reducer>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanOp<Reducer, const Derived>
+ scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+ return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
+ }
+
// Reductions.
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
@@ -676,6 +743,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
slice(const StartIndices& startIndices, const Sizes& sizes) const {
return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
}
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ const Derived>(derived(), startIndices, stopIndices, strides);
+ }
template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorChippingOp<DimId, const Derived>
chip(const Index offset) const {
@@ -750,8 +823,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
};
-template<typename Derived>
-class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
public:
typedef internal::traits<Derived> DerivedTraits;
typedef typename DerivedTraits::Scalar Scalar;
@@ -761,7 +834,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
- template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+ template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setZero() {
@@ -780,7 +853,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
return derived() = this->template random<RandomGenerator>();
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setValues(
const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
@@ -851,6 +924,19 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
}
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ const Derived>(derived(), startIndices, stopIndices, strides);
+ }
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ Derived>(derived(), startIndices, stopIndices, strides);
+ }
+
template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorChippingOp<DimId, const Derived>
chip(const Index offset) const {
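Among the additions in this file are friend operators that accept the scalar on the left-hand side (via `bind1st_op`) and the new `stridedSlice` API. A small sketch of both, illustrative only and assuming C++11 (where `Eigen::array` is `std::array`):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> t(2, 3);
      t.setConstant(2.0f);

      // Scalar on the left now works through the new bind1st-based friends.
      Eigen::Tensor<float, 2> a = 1.0f + t;   // 3.0 everywhere
      Eigen::Tensor<float, 2> b = 10.0f - t;  // 8.0 everywhere
      Eigen::Tensor<float, 2> c = 3.0f * t;   // 6.0 everywhere
      Eigen::Tensor<float, 2> d = 1.0f / t;   // 0.5 everywhere

      // stridedSlice(start, stop, strides): every other column, a 2x2 result.
      Eigen::array<Eigen::DenseIndex, 2> start = {0, 0};
      Eigen::array<Eigen::DenseIndex, 2> stop = {2, 3};
      Eigen::array<Eigen::DenseIndex, 2> strides = {1, 2};
      Eigen::Tensor<float, 2> e = t.stridedSlice(start, stop, strides);
      return 0;
    }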
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index c771496e2..5d67f69f3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = false,
+ IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@@ -118,7 +118,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
// and store the result in a scalar. Instead one should reshape the scalar into an N-D
// tensor with N >= 1 of 1 element first and then broadcast.
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
const InputDimensions& input_dims = m_impl.dimensions();
const Broadcast& broadcast = op.broadcast();
for (int i = 0; i < NumDims; ++i) {
@@ -247,7 +247,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
@@ -299,7 +299,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
@@ -354,11 +354,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
if (NumDims > 0) {
for (int i = NumDims - 1; i > 0; --i) {
compute_cost += TensorOpCost::DivCost<Index>();
- if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+ if (internal::index_statically_eq<Broadcast>(i, 1)) {
compute_cost +=
TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
} else {
- if (!internal::index_statically_eq<InputDimensions>()(i, 1)) {
+ if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
compute_cost += TensorOpCost::MulCost<Index>() +
TensorOpCost::ModCost<Index>() +
TensorOpCost::AddCost<Index>();
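The constructor comment above spells out that a rank-0 tensor cannot be broadcast directly; one broadcasts a one-element tensor of rank >= 1 instead. A short illustrative sketch (sizes are arbitrary) of both that workaround and a plain broadcast:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // "Broadcasting a scalar": use a one-element rank-1 tensor, as the
      // static assertion above requires.
      Eigen::Tensor<float, 1> s(1);
      s.setConstant(3.0f);
      Eigen::array<Eigen::DenseIndex, 1> five{{5}};
      Eigen::Tensor<float, 1> filled = s.broadcast(five);   // five copies of 3.0f

      // Ordinary broadcasting: tile a 2x3 tensor twice along each dimension.
      Eigen::Tensor<float, 2> t(2, 3);
      t.setRandom();
      Eigen::array<Eigen::DenseIndex, 2> twice{{2, 2}};
      Eigen::Tensor<float, 2> tiled = t.broadcast(twice);   // 4x6 result
      return 0;
    }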
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 2742dbb95..1ba7ef170 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -152,8 +152,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
- // We could also support the case where NumInputDims==1 if needed.
- EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -203,7 +202,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
@@ -342,7 +341,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
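Relaxing the assertion from `NumInputDims >= 2` to `>= 1` means a rank-1 tensor can now be chipped, yielding a rank-0 result. A brief illustrative sketch, where the rank-0 case is the one newly permitted:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Newly allowed: chipping a rank-1 tensor gives a rank-0 (scalar) tensor.
      Eigen::Tensor<float, 1> v(4);
      v.setValues({10.f, 11.f, 12.f, 13.f});
      Eigen::Tensor<float, 0> third = v.chip(2, 0);   // holds 12.0f

      // The usual case: fix one dimension of a higher-rank tensor.
      Eigen::Tensor<float, 3> t(2, 3, 4);
      t.setRandom();
      Eigen::Tensor<float, 2> slab = t.chip(1, 2);    // the 2x3 slice at index 1 of dim 2
      return 0;
    }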
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 839c6e3e5..59bf90d93 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -128,8 +128,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(0 <= m_axis && m_axis < NumDims);
const Dimensions& lhs_dims = m_leftImpl.dimensions();
@@ -248,8 +248,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
@@ -344,8 +344,8 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 6f113b903..20b29e5fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
- typename RhsXprType::Scalar>::ret Scalar;
+ typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+ typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -37,7 +38,7 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
typedef typename remove_reference<RhsNested>::type _RhsNested;
// From NumDims below.
- static const int NumDimensions = max_n_1<traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size;
+ static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
static const int Layout = traits<LhsXprType>::Layout;
enum {
@@ -65,7 +66,7 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
typedef Device_ Device;
// From NumDims below.
- static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size;
+ static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
};
} // end namespace internal
@@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
{
public:
typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
- typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
- typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
@@ -140,11 +141,11 @@ struct TensorContractionEvaluatorBase
static const int RDims =
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
typedef DSizes<Index, NumDims> Dimensions;
@@ -218,11 +219,9 @@ struct TensorContractionEvaluatorBase
rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
}
- m_i_strides[0] = 1;
- m_j_strides[0] = 1;
- if(ContractDims) {
- m_k_strides[0] = 1;
- }
+ if (m_i_strides.size() > 0) m_i_strides[0] = 1;
+ if (m_j_strides.size() > 0) m_j_strides[0] = 1;
+ if (m_k_strides.size() > 0) m_k_strides[0] = 1;
m_i_size = 1;
m_j_size = 1;
@@ -318,11 +317,6 @@ struct TensorContractionEvaluatorBase
}
}
- // Scalar case. We represent the result as a 1d tensor of size 1.
- if (LDims + RDims == 2 * ContractDims) {
- m_dimensions[0] = 1;
- }
-
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -510,7 +504,7 @@ struct TensorContractionEvaluatorBase
// call gebp (matrix kernel)
// The parameters here are copied from Eigen's GEMM implementation
- gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
+ gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
}
}
}
@@ -607,15 +601,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
static const int ContractDims = internal::array_size<Indices>::value;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
// Could we use NumDimensions here?
typedef DSizes<Index, NumDims> Dimensions;
-
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) { }
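With the `max_n_1` padding and the "1d tensor of size 1" special case removed, contracting away every dimension now produces a genuine rank-0 tensor. An illustrative dot-product sketch of that case, not taken from the patch:

    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> u(3), v(3);
      u.setValues({1.f, 2.f, 3.f});
      v.setValues({4.f, 5.f, 6.f});

      // All dimensions are contracted, so the result is rank 0.
      Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(0, 0)};
      Eigen::Tensor<float, 0> dot = u.contract(v, dims);

      std::cout << dot << std::endl;   // 32
      return 0;
    }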
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 6a3ef14ef..d65dbb40f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -461,8 +461,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
#undef writeResultShmem
#undef writeRow
- const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
- const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+ const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+ const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
if (threadIdx.x < max_i_write) {
if (max_j_write == 8) {
@@ -1240,10 +1240,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef DSizes<Index, NumDims> Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index b27e1a1b4..9b2cb3ff6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -130,19 +130,19 @@ class SimpleTensorContractionMapper {
}
Index contract_val = left ? col : row;
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx = contract_val / m_k_strides[i];
- linidx += idx * m_contract_strides[i];
- contract_val -= idx * m_k_strides[i];
- }
-
if(array_size<contract_t>::value > 0) {
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx += contract_val;
- } else {
- linidx += contract_val * m_contract_strides[0];
- }
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx = contract_val / m_k_strides[i];
+ linidx += idx * m_contract_strides[i];
+ contract_val -= idx * m_k_strides[i];
+ }
+
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx += contract_val;
+ } else {
+ linidx += contract_val * m_contract_strides[0];
+ }
}
return linidx;
@@ -153,15 +153,15 @@ class SimpleTensorContractionMapper {
const bool left = (side == Lhs);
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
- for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = nocontract_val[0] / m_ij_strides[i];
- const Index idx1 = nocontract_val[1] / m_ij_strides[i];
- linidx[0] += idx0 * m_nocontract_strides[i];
- linidx[1] += idx1 * m_nocontract_strides[i];
- nocontract_val[0] -= idx0 * m_ij_strides[i];
- nocontract_val[1] -= idx1 * m_ij_strides[i];
- }
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+ const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+ linidx[0] += idx0 * m_nocontract_strides[i];
+ linidx[1] += idx1 * m_nocontract_strides[i];
+ nocontract_val[0] -= idx0 * m_ij_strides[i];
+ nocontract_val[1] -= idx1 * m_ij_strides[i];
+ }
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx[0] += nocontract_val[0];
@@ -173,22 +173,24 @@ class SimpleTensorContractionMapper {
}
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = contract_val[0] / m_k_strides[i];
- const Index idx1 = contract_val[1] / m_k_strides[i];
- linidx[0] += idx0 * m_contract_strides[i];
- linidx[1] += idx1 * m_contract_strides[i];
- contract_val[0] -= idx0 * m_k_strides[i];
- contract_val[1] -= idx1 * m_k_strides[i];
- }
+ if (array_size<contract_t>::value> 0) {
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = contract_val[0] / m_k_strides[i];
+ const Index idx1 = contract_val[1] / m_k_strides[i];
+ linidx[0] += idx0 * m_contract_strides[i];
+ linidx[1] += idx1 * m_contract_strides[i];
+ contract_val[0] -= idx0 * m_k_strides[i];
+ contract_val[1] -= idx1 * m_k_strides[i];
+ }
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx[0] += contract_val[0];
- linidx[1] += contract_val[1];
- } else {
- linidx[0] += contract_val[0] * m_contract_strides[0];
- linidx[1] += contract_val[1] * m_contract_strides[0];
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx[0] += contract_val[0];
+ linidx[1] += contract_val[1];
+ } else {
+ linidx[0] += contract_val[0] * m_contract_strides[0];
+ linidx[1] += contract_val[1] * m_contract_strides[0];
+ }
}
return IndexPair<Index>(linidx[0], linidx[1]);
}
@@ -200,7 +202,7 @@ class SimpleTensorContractionMapper {
return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
- return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
+ return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
}
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 9044454fd..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -14,6 +14,8 @@
#ifdef EIGEN_USE_THREADS
namespace Eigen {
+
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
namespace internal {
template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -52,7 +54,7 @@ struct packRhsAndKernelArg {
};
} // end namespace internal
-
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -92,10 +94,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef DSizes<Index, NumDims> Dimensions;
@@ -110,6 +112,623 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) {}
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+ bool rhs_inner_dim_reordered, int Alignment>
+ void evalProduct(Scalar* buffer) const {
+ typedef
+ typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
+ LhsScalar;
+ typedef
+ typename internal::remove_const<typename EvalRightArgType::Scalar>::type
+ RhsScalar;
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+ typedef internal::TensorContractionInputMapper<
+ LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+ contract_t, internal::packet_traits<LhsScalar>::size,
+ lhs_inner_dim_contiguous, false, Unaligned>
+ LhsMapper;
+ typedef internal::TensorContractionInputMapper<
+ RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+ contract_t, internal::packet_traits<RhsScalar>::size,
+ rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+ RhsMapper;
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+ typedef internal::gemm_pack_lhs<LhsScalar, Index,
+ typename LhsMapper::SubMapper, Traits::mr,
+ Traits::LhsProgress, ColMajor>
+ LhsPacker;
+ typedef internal::gemm_pack_rhs<
+ RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
+ RhsPacker;
+ typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+ Traits::mr, Traits::nr, false, false>
+ GebpKernel;
+
+ const Index m = this->m_i_size;
+ const Index n = this->m_j_size;
+ const Index k = this->m_k_size;
+ if (m == 0 || n == 0 || k == 0) return;
+
+ // Compute a set of algorithm parameters:
+ // - kernel block sizes (bm, bn, bk)
+ // - task grain sizes (number of kernels executed per task: gm, gn)
+ // - number of threads
+ // - sharding by row/column
+ // - parallel packing or first lhs then rhs
+ // and some derived parameters:
+ // - number of tasks (nm, nn, nk)
+ // - number of kernels (nm0, nn0)
+ // Unfortunately, all these parameters are tightly interdependent.
+ // So in some cases we first compute approximate values, then compute other
+ // values based on these approximations and then refine the approximations.
+
+ // There are lots of heuristics here. There is some reasoning behind them,
+ // but ultimately they are just tuned on contraction benchmarks for
+ // different input configurations, thread counts and instruction sets.
+ // So feel free to question any of them.
+
+ // Compute whether we want to shard by row or by column.
+    // This is a first approximation; it will be refined later. Since we don't
+    // know the number of threads yet we use 2, because what we are most
+    // interested in at this point is whether it makes sense to use
+    // parallelization at all or not.
+ bool shard_by_col = shardByCol(m, n, 2);
+
+ // First approximation of kernel blocking sizes.
+    // Again, we don't know the number of threads yet, so we use 2.
+ Index bm, bn, bk;
+ if (shard_by_col) {
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::ShardByCol>
+ blocking(k, m, n, 2);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ } else {
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::ShardByRow>
+ blocking(k, m, n, 2);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ }
+
+ // Compute optimal number of threads.
+    // Note: we use bk instead of k here because we are interested in the
+    // amount of _parallelizable_ computation, and computations are not
+    // parallelizable across the k dimension.
+ const TensorOpCost cost =
+ contractionCost(m, n, bm, bn, bk, shard_by_col, false);
+ int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ static_cast<double>(n) * m, cost, this->m_device.numThreads());
+
+ // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
+ // model is not tuned. Remove this when the cost model is tuned.
+ if (n == 1) num_threads = 1;
+
+ if (num_threads == 1) {
+ // The single-threaded algorithm should be faster in this case.
+ if (n == 1)
+ this->template evalGemv<lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Alignment>(buffer);
+ else
+ this->template evalGemm<lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Alignment>(buffer);
+ return;
+ }
+
+ // Now that we know number of threads, recalculate sharding and blocking.
+ shard_by_col = shardByCol(m, n, num_threads);
+ if (shard_by_col) {
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::ShardByCol>
+ blocking(k, m, n, num_threads);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ } else {
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::ShardByRow>
+ blocking(k, m, n, num_threads);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ }
+
+ // Number of kernels for each dimension.
+ Index nm0 = divup(m, bm);
+ Index nn0 = divup(n, bn);
+ Index nk = divup(k, bk);
+
+ // Calculate task grain size (number of kernels executed per task).
+ // This task size coarsening serves two purposes:
+ // 1. It reduces per-task overheads including synchronization overheads.
+    // 2. It allows us to use caches better (reuse the same packed rhs in several
+ // consecutive kernels).
+ Index gm = 1;
+ Index gn = 1;
+ // If we are sharding by column, then we prefer to reduce rows first.
+ if (shard_by_col) {
+ gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+ gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+ } else {
+ gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+ gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+ }
+ // Number of tasks in each dimension.
+ Index nm = divup(nm0, gm);
+ Index nn = divup(nn0, gn);
+
+    // Last but not least, decide whether we want to issue both lhs and rhs
+ // packing in parallel; or issue lhs packing first, and then issue rhs
+ // packing when lhs packing completes (for !shard_by_col lhs and rhs are
+ // swapped). Parallel packing allows more parallelism (for both packing and
+ // kernels), while sequential packing provides better locality (once
+    // a thread finishes rhs packing it proceeds to kernels with that rhs).
+ // First, we are interested in parallel packing if there are few tasks.
+ bool parallel_pack = num_threads >= nm * nn;
+ // Also do parallel packing if all data fits into L2$.
+ if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
+ l2CacheSize() * num_threads)
+ parallel_pack = true;
+ // But don't do it if we will use each rhs only once. Locality seems to be
+ // more important in this case.
+ if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
+ this->m_i_strides, this->m_left_contracting_strides,
+ this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
+ this->m_j_strides, this->m_right_contracting_strides,
+ this->m_k_strides);
+
+ Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
+ OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
+ k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
+ shard_by_col, parallel_pack)
+ .run();
+ }
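For illustration only (not part of the patch): a standalone sketch of how the derived task counts above fall out of the block and grain sizes, using made-up sizes; divup mirrors Eigen's helper of the same name.

    // Hypothetical numbers; shows nm0/nn0/nk (kernels) vs nm/nn (tasks).
    #include <cstdio>

    int main() {
      auto divup = [](long x, long y) { return (x + y - 1) / y; };
      const long m = 1024, n = 1024, k = 512;  // contraction sizes (assumed)
      const long bm = 128, bn = 64, bk = 256;  // kernel block sizes (assumed)
      const long gm = 2, gn = 1;               // task grain sizes (assumed)
      const long nm0 = divup(m, bm);           // 8 kernels along m
      const long nn0 = divup(n, bn);           // 16 kernels along n
      const long nk  = divup(k, bk);           // 2 k slices
      const long nm  = divup(nm0, gm);         // 4 tasks along m
      const long nn  = divup(nn0, gn);         // 16 tasks along n
      std::printf("kernels %ldx%ld over %ld slices, tasks %ldx%ld\n",
                  nm0, nn0, nk, nm, nn);
      return 0;
    }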
+
+ // Context coordinates a single parallel gemm operation.
+ template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
+ typename LhsMapper, typename RhsMapper, typename OutputMapper>
+ class Context {
+ public:
+ Context(const Device& device, int num_threads, LhsMapper& lhs,
+ RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
+ Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
+ Index gn, Index nm0, Index nn0, bool shard_by_col,
+ bool parallel_pack)
+ : device_(device),
+ lhs_(lhs),
+ rhs_(rhs),
+ buffer_(buffer),
+ output_(buffer, tm),
+ num_threads_(num_threads),
+ shard_by_col_(shard_by_col),
+ parallel_pack_(parallel_pack),
+ m_(tm),
+ n_(tn),
+ k_(tk),
+ bm_(bm),
+ bn_(bn),
+ bk_(bk),
+ nm_(nm),
+ nn_(nn),
+ nk_(nk),
+ gm_(gm),
+ gn_(gn),
+ nm0_(nm0),
+ nn0_(nn0)
+ {
+ for (Index x = 0; x < P; x++) {
+        // The normal number of notifications for a k slice switch is
+        // nm_ + nn_ + nm_ * nn_. However, the first P - 1 slices will receive
+        // only nm_ + nn_ notifications, because they will not receive
+        // notifications from preceding kernels.
+ state_switch_[x] =
+ x == 0
+ ? 1
+ : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
+ (x == P - 1 ? nm_ * nn_ : 0);
+ state_packing_ready_[x] =
+ parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
+ state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
+ for (Index m = 0; m < nm_; m++) {
+ state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
+ // Kernels generally receive 3 notifications (previous kernel + 2
+ // packing), but the first slice won't get notifications from previous
+ // kernels.
+ for (Index n = 0; n < nn_; n++)
+ state_kernel_[x][m][n].store(
+ (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
+ std::memory_order_relaxed);
+ }
+ }
+
+ // Allocate memory for packed rhs/lhs matrices.
+ size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+ size_t lhs_size =
+ divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
+ size_t rhs_size =
+ divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
+ packed_mem_ = static_cast<char*>(internal::aligned_malloc(
+ (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
+ char* mem = static_cast<char*>(packed_mem_);
+ for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
+ packed_lhs_[x].resize(nm0_);
+ for (Index m = 0; m < nm0_; m++) {
+ packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
+ mem += lhs_size;
+ }
+ packed_rhs_[x].resize(nn0_);
+ for (Index n = 0; n < nn0_; n++) {
+ packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
+ mem += rhs_size;
+ }
+ }
+ }
+
+ ~Context() {
+ for (Index x = 0; x < P; x++) {
+ for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
+ delete[] state_kernel_[x];
+ }
+ internal::aligned_free(packed_mem_);
+ }
+
+ void run() {
+ // Kick off packing of the first slice.
+ signal_switch(0, 1);
+ // Wait for overall completion.
+ // TODO(dvyukov): this wait can lead to deadlock.
+ // If nthreads contractions are concurrently submitted from worker
+ // threads, this wait will block all worker threads and the system will
+ // deadlock.
+ done_.Wait();
+ }
+
+ private:
+ Notification done_;
+ const Device& device_;
+ LhsMapper& lhs_;
+ RhsMapper& rhs_;
+ Scalar* const buffer_;
+ OutputMapper output_;
+ const int num_threads_;
+ const bool shard_by_col_;
+ const bool parallel_pack_;
+ // Matrix sizes.
+ const Index m_;
+ const Index n_;
+ const Index k_;
+ // Block sizes.
+ const Index bm_;
+ const Index bn_;
+ const Index bk_;
+ // Number of tasks.
+ const Index nm_;
+ const Index nn_;
+ const Index nk_;
+ // Task grain sizes (number of kernels executed per task).
+ const Index gm_;
+ const Index gn_;
+    // Number of blocks (this is different from nm_/nn_ because of task size
+ // coarsening).
+ const Index nm0_;
+ const Index nn0_;
+
+ // Parallelization strategy.
+ //
+ // Blocks related to the same k block can run in parallel because they write
+    // to different output blocks. So we parallelize within k slices; this
+    // gives us a parallelism level of m x n. Before we can start any kernels
+ // related to k-th slice, we need to issue m lhs packing tasks and n rhs
+ // packing tasks.
+ //
+ // However, there is a bottleneck when we are finishing kernels for k-th
+ // slice (at the very end there is only 1 runnable kernel). To mitigate this
+ // bottleneck we allow kernels from k-th and k+1-th slices to run in
+ // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
+ // output block, so they must not run in parallel.
+ //
+ // This gives us the following dependency graph.
+    // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
+ // packing tasks.
+ // Kernel (m, n, k) can start when:
+ // - kernel (m, n, k-1) has finished
+ // - lhs packing (m, k) has finished
+ // - rhs packing (n, k) has finished
+ // Lhs/rhs packing can start when:
+    //  - all packing for slice k-1 has finished (artificially imposed to
+    //    limit the amount of parallel packing)
+ //
+ // On top of that we limit runnable tasks to two consecutive k slices.
+ // This is done to limit amount of memory we need for packed lhs/rhs
+ // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
+ //
+ // state_switch_ tracks when we are ready to switch to the next k slice.
+ // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
+    // These variables roll over 3 consecutive k slices: the first two we are
+    // actively executing, plus one to track completion of kernels in the
+    // second slice.
+ static const Index P = 3;
+ void* packed_mem_;
+ std::vector<LhsScalar*> packed_lhs_[P - 1];
+ std::vector<RhsScalar*> packed_rhs_[P - 1];
+ std::atomic<uint8_t>** state_kernel_[P];
+ // state_switch_ is frequently modified by worker threads, while other
+ // fields are read-only after constructor. Let's move it to a separate cache
+ // line to reduce cache-coherency traffic.
+ char pad_[128];
+ std::atomic<Index> state_packing_ready_[P];
+ std::atomic<Index> state_switch_[P];
+
+ void pack_lhs(Index m, Index k) {
+ const Index mend = m * gm_ + gm(m);
+ for (Index m1 = m * gm_; m1 < mend; m1++)
+ LhsPacker()(packed_lhs_[k % (P - 1)][m1],
+ lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+
+ if (!parallel_pack_ && shard_by_col_) {
+ signal_packing(k);
+ } else {
+ signal_switch(k + 1);
+ for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
+ }
+ }
+
+ void pack_rhs(Index n, Index k) {
+ const Index nend = n * gn_ + gn(n);
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ if (k == 0) {
+ // Zero the output memory in parallel.
+          // On a 10000x2x10000 matmul, zeroing can easily take half of the time.
+          // Zero a (bn x m) strip of the output. This is safe to do here
+          // because all kernels that will write to this memory depend on
+          // completion of this task.
+ // Note: don't call device_.memset() here. device_.memset() blocks on
+ // thread pool worker thread, which can lead to underutilization and
+ // deadlocks.
+ memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
+ }
+ RhsPacker()(packed_rhs_[k % (P - 1)][n1],
+ rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+ }
+
+ if (parallel_pack_ || shard_by_col_) {
+ signal_switch(k + 1);
+ for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
+ } else {
+ signal_packing(k);
+ }
+ }
+
+ void kernel(Index m, Index n, Index k) {
+ // Note: order of iteration matters here. Iteration over m is innermost
+      // because we want to reuse the same packed rhs in consecutive tasks
+ // (rhs fits into L2$ while lhs only into L3$).
+ const Index nend = n * gn_ + gn(n);
+ const Index mend = m * gm_ + gm(m);
+ if (shard_by_col_) {
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ for (Index m1 = m * gm_; m1 < mend; m1++)
+ GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+ packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+ Scalar(1), -1, -1, 0, 0);
+ }
+ } else {
+ for (Index m1 = m * gm_; m1 < mend; m1++)
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+ packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+ Scalar(1), -1, -1, 0, 0);
+ }
+ }
+ signal_kernel(m, n, k + 1, false);
+ signal_switch(k + 2);
+ }
+
+ void signal_packing(Index k) {
+ eigen_assert(!parallel_pack_);
+ Index s = state_packing_ready_[k % P].fetch_sub(1);
+ eigen_assert(s > 0);
+ if (s != 1) return;
+ state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
+ enqueue_packing(k, shard_by_col_);
+ }
+
+ void signal_kernel(Index m, Index n, Index k, bool sync) {
+ std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
+ Index s = state->load();
+ eigen_assert(s > 0);
+ if (s != 1 && state->fetch_sub(1) != 1) return;
+ state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
+ if (sync)
+ kernel(m, n, k);
+ else
+ device_.enqueueNoNotification([=]() { kernel(m, n, k); });
+ }
+
+ void signal_switch(Index k, Index v = 1) {
+ Index s = state_switch_[k % P].fetch_sub(v);
+ eigen_assert(s >= v);
+ if (s != v) return;
+
+ // Ready to switch to the next k slice.
+ // Reset counter for the next iteration.
+ state_switch_[k % P] =
+ (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
+ nm_ * nn_;
+ if (k < nk_) {
+ // Issue lhs/rhs packing. Their completion will in turn kick off
+ // kernels.
+ if (parallel_pack_) {
+ enqueue_packing(k, !shard_by_col_);
+ enqueue_packing(k, shard_by_col_);
+ } else if (shard_by_col_) {
+ enqueue_packing(k, false);
+ } else {
+ enqueue_packing(k, true);
+ }
+
+      // Termination handling.
+      // Because kernel completion signals the k + 2 switch, we need to finish
+      // nk + 2 slices without issuing any tasks on slice nk + 1. So here we
+      // pretend that all nk + 1 packing tasks finish instantly, so that the
+      // nk + 2 switch only waits for completion of nk kernels.
+ } else if (k == nk_) {
+ signal_switch(k + 1,
+ parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
+ } else {
+ done_.Notify();
+ }
+ }
+
+ // Enqueue all rhs/lhs packing for k-th slice.
+ void enqueue_packing(Index k, bool rhs) {
+ enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
+ }
+
+ void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
+ if (end - start == 1) {
+ if (rhs)
+ pack_rhs(start, k);
+ else
+ pack_lhs(start, k);
+ } else {
+ Index mid = (start + end) / 2;
+ device_.enqueueNoNotification(
+ [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+ device_.enqueueNoNotification(
+ [=]() { enqueue_packing_helper(start, mid, k, rhs); });
+ }
+ }
+
+    // Block sizes, accounting for a potentially incomplete last block.
+ Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
+ Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
+ Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
+    // Task grain sizes, accounting for a potentially incomplete last task.
+ Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
+ Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
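A quick numeric check of the incomplete-last-block formula above (illustrative numbers only): with m_ = 1000 and bm_ = 128 there are nm0_ = divup(1000, 128) = 8 blocks, and the last one covers 1000 + 128 - 128 * 8 = 104 rows rather than a full 128.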
+
+ Context(const Context&) = delete;
+ void operator=(const Context&) = delete;
+ };
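As an aside (not part of the patch), the signal_* machinery above is a set of atomic countdown latches: each counter starts at the number of prerequisites, and the dependent work fires when the final decrement reaches zero. A minimal standalone sketch of that pattern, with hypothetical names:

    #include <atomic>
    #include <cstdio>

    // Fires `fn` once the expected number of signal() calls has arrived,
    // mirroring the fetch_sub-based counters used by Context above.
    class CountdownTrigger {
     public:
      explicit CountdownTrigger(int expected) : count_(expected) {}
      template <typename Fn>
      void signal(Fn fn) {
        if (count_.fetch_sub(1) == 1) fn();  // last signaller runs the task
      }
     private:
      std::atomic<int> count_;
    };

    int main() {
      CountdownTrigger kernel_ready(3);  // previous kernel + lhs pack + rhs pack
      for (int i = 0; i < 3; ++i)
        kernel_ready.signal([] { std::printf("kernel (m, n, k) can start\n"); });
      return 0;
    }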
+
+  // Decide whether we want to shard the m x n contraction by columns or by rows.
+ static bool shardByCol(Index m, Index n, Index num_threads) {
+ // Note: we are comparing both n and m against Traits::nr, it is not
+ // a mistake. We are trying to figure out how both n and m will fit into
+ // the main sharding dimension.
+
+ // Sharding by column is the default
+ // ... unless there is enough data for vectorization over rows
+ if (m / num_threads >= Traits::nr &&
+ // and not enough data for vectorization over columns
+ (n / num_threads < Traits::nr ||
+ // ... or barely enough data for vectorization over columns,
+         // but it is not evenly divisible across threads
+ (n / num_threads < 4 * Traits::nr &&
+ (n % (num_threads * Traits::nr)) != 0 &&
+          // ... and it is evenly divisible across threads for rows
+ ((m % (num_threads * Traits::nr)) == 0 ||
+           // ... or it is not evenly divisible in either dimension, but
+ // there is much more data over rows so that corner effects are
+ // mitigated.
+ (m / n >= 6)))))
+ return false;
+    // Also shard by row if the matrix is substantially elongated along the
+    // row dimension.
+ if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
+ return true;
+ }
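For intuition only: assuming a Traits::nr of 8, a strongly elongated product such as m = 4096, n = 64 on 4 threads ends up sharded by rows, while a square 1024 x 1024 product keeps the default column sharding. A simplified, standalone restatement of the decision (not the actual member function; nr is a stand-in for gebp_traits::nr):

    #include <cstdio>

    static bool shard_by_col_sketch(long m, long n, long threads, long nr = 8) {
      if (m / threads >= nr &&
          (n / threads < nr ||
           (n / threads < 4 * nr && (n % (threads * nr)) != 0 &&
            ((m % (threads * nr)) == 0 || m / n >= 6))))
        return false;                                   // shard by row
      if (n / threads < 16 * nr && m > n * 32) return false;
      return true;                                      // shard by column
    }

    int main() {
      std::printf("4096x64   on 4 threads -> %s\n",
                  shard_by_col_sketch(4096, 64, 4) ? "columns" : "rows");
      std::printf("1024x1024 on 4 threads -> %s\n",
                  shard_by_col_sketch(1024, 1024, 4) ? "columns" : "rows");
      return 0;
    }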
+
+ Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
+ int num_threads, bool shard_by_col) const {
+ Index gm = 1;
+ Index gm1 = 1;
+ Index nm0 = divup(m, bm);
+ Index nm1 = nm0;
+ for (;;) {
+      // Find the next candidate for the m grain size. It needs to result in a
+      // different number of blocks. E.g. if we have 10 kernels, we want to try
+ // 5 and 10, but not 6, 7, 8 and 9.
+ while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
+ if (gm1 > nm0) break;
+ // Check the candidate.
+ int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
+ shard_by_col);
+ if (res < 0) break;
+ nm1 = divup(nm0, gm1);
+ if (res == 0) continue;
+ // Commit new grain size.
+ gm = gm1;
+ }
+ return gm;
+ }
+
+ Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+ int num_threads, bool shard_by_col) const {
+ Index gn = 1;
+ Index gn1 = 1;
+ Index nn0 = divup(n, bn);
+ Index nn1 = nn0;
+ for (;;) {
+ while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
+ if (gn1 > nn0) break;
+ int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
+ shard_by_col);
+ if (res < 0) break;
+ nn1 = divup(nn0, gn1);
+ if (res == 0) continue;
+ gn = gn1;
+ }
+ return gn;
+ }
+
+ // checkGrain checks whether grain (gm, gn) is suitable and is better than
+ // (oldgm, oldgn).
+ int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+ Index gn, Index oldgm, Index oldgn, int num_threads,
+ bool shard_by_col) const {
+ const TensorOpCost cost =
+ contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
+ double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+ static_cast<double>(bm) * gm * bn * gn, cost);
+    // If the task is too small, then we accept it regardless of anything else;
+    // otherwise synchronization overheads will dominate.
+ if (taskSize < 1) return 1;
+ // If it is too large, then we reject it and all larger tasks.
+ if (taskSize > 2) return -1;
+ // Now we are in presumably good task size range.
+ // The main deciding factor here is parallelism. Consider that we have 12
+ // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
+    // But grains of 2 and 4 yield 6 and 3 tasks respectively, which gives us a
+    // parallelism of 0.75 (at most 3/4 of the cores will be busy), while a
+    // grain size of 3 gives us 4 tasks, which gives us a parallelism of 1
+    // (we can load all cores).
+ Index nm0 = divup(m, bm);
+ Index nn0 = divup(n, bn);
+ Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
+ double new_parallelism = static_cast<double>(new_tasks) /
+ (divup<int>(new_tasks, num_threads) * num_threads);
+ Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
+ double old_parallelism = static_cast<double>(old_tasks) /
+ (divup<int>(old_tasks, num_threads) * num_threads);
+ if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
+ return 0;
+ }
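To make the parallelism arithmetic in the comment above concrete, here is a tiny standalone calculation for the 12-kernels / 4-threads example (illustrative only):

    #include <cstdio>

    int main() {
      auto divup = [](int x, int y) { return (x + y - 1) / y; };
      const int kernels = 12, threads = 4;   // the example from the comment
      for (int grain : {2, 3, 4}) {
        int tasks = divup(kernels, grain);
        double parallelism =
            static_cast<double>(tasks) / (divup(tasks, threads) * threads);
        // grain 2 -> 6 tasks, 0.75; grain 3 -> 4 tasks, 1.00; grain 4 -> 3 tasks, 0.75
        std::printf("grain %d -> %d tasks, parallelism %.2f\n",
                    grain, tasks, parallelism);
      }
      return 0;
    }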
+
+#else // EIGEN_USE_SIMPLE_THREAD_POOL
+
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
if (this->m_j_size == 1) {
@@ -376,7 +995,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
gebp(arg.output.getSubMapper(m_base_start, arg.n),
(*arg.blockAs)[blockAId], arg.blockB,
- actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
+ actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
// Notify that the kernel is done.
const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
@@ -384,6 +1003,47 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
}
}
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
+
+ TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
+ bool shard_by_col, bool prepacked) const {
+ const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
+ PacketType<RhsScalar, Device>::size);
+ const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+ const double kd = static_cast<double>(bk);
+    // Peak VFMA bandwidth is 0.5. However, if we do not have enough data for
+    // vectorization, the bandwidth drops. The 4.0 and 2.0 bandwidth values were
+    // determined experimentally.
+ double computeBandwidth = bk == 1 ? 4.0 :
+ (shard_by_col ? bn : bm) < Traits::nr ||
+ (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+ // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
+    // However for MULPS/ADDPS we have a dependent sequence of 2 such instructions,
+ // so overall bandwidth is 1.0.
+ if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
+ // Computations.
+ TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
+ // Output stores.
+ cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+ if (prepacked) {
+      // Packing and kernels are executed in different tasks. When we calculate
+      // the task grain size we look only at the kernel cost, assuming that the
+      // kernel is more expensive than packing.
+ return cost;
+ }
+ // Lhs/rhs loads + computations.
+ TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
+ TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
+ // Lhs packing memory cost does not contribute considerably to overall
+ // execution time because lhs is prefetched early and accessed sequentially.
+ if (shard_by_col)
+ lhsCost.dropMemoryCost();
+ else
+ rhsCost.dropMemoryCost();
+ return cost + lhsCost + rhsCost;
+ }
};
} // end namespace Eigen
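A small illustration (not part of the patch) of how a per-coefficient estimate like the one in contractionCost composes, using only the TensorOpCost constructors, operator+= and accessors that appear later in this patch (TensorCostModel.h); the numbers are invented.

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      const double bk = 256;         // hypothetical k block size
      const double bandwidth = 0.5;  // assumed peak FMA reciprocal throughput
      // Compute term, vectorized with an assumed packet size of 8.
      Eigen::TensorOpCost cost(0, 0, bk * bandwidth, /*vectorized=*/true,
                               /*packet_size=*/8);
      // Output store term.
      cost += Eigen::TensorOpCost(0, sizeof(float), 0, true, 8);
      std::cout << "cycles/coeff: " << cost.compute_cycles()
                << ", bytes stored/coeff: " << cost.bytes_stored() << "\n";
      return 0;
    }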
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index a2f1f71f5..860a6949a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -164,14 +164,14 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
};
template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
impl.evalSubExprsIfNeeded(NULL);
return true;
}
};
template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
return impl.evalSubExprsIfNeeded(data);
}
};
@@ -193,7 +193,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
enum {
IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+ PacketAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
@@ -224,11 +224,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
- const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
- PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
- SrcCoeffRatio, TgtCoeffRatio> converter(m_impl);
- return converter.template packet<LoadMode>(index);
+ const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
+ return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -249,7 +247,31 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
- TensorEvaluator<ArgType, Device> m_impl;
+ template <int LoadMode, bool ActuallyVectorize>
+ struct PacketConv {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ internal::scalar_cast_op<SrcType, TargetType> converter;
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = converter(impl.coeff(index+i));
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ };
+
+ template <int LoadMode>
+ struct PacketConv<LoadMode, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+ const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+ PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
+ SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+ return converter.template packet<LoadMode>(index);
+ }
+ };
+
+ TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
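The PacketConv change above is a standard bool-parameter partial-specialization dispatch; a minimal standalone sketch of the same pattern follows (names are illustrative, not Eigen's):

    #include <cstdio>

    // Generic fallback: element-by-element conversion.
    template <bool Vectorized>
    struct ConvertDispatch {
      static void run() { std::printf("scalar conversion path\n"); }
    };

    // Specialization picked at compile time when a vectorized cast exists.
    template <>
    struct ConvertDispatch<true> {
      static void run() { std::printf("vectorized conversion path\n"); }
    };

    int main() {
      // Stands in for PacketAccess & type_casting_traits<...>::VectorizedCast.
      constexpr bool kHasVectorizedCast = true;
      ConvertDispatch<kHasVectorizedCast>::run();
      return 0;
    }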
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 091007ab7..abdf742c6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -254,7 +254,7 @@ struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, t
template<typename Indices, typename InputXprType, typename KernelXprType>
-class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 0f6dcedaa..83c449cf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -10,10 +10,6 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
-//#if !defined(EIGEN_USE_GPU)
-//#define EIGEN_USE_COST_MODEL
-//#endif
-
namespace Eigen {
/** \class TensorEvaluator
@@ -32,45 +28,47 @@ class TensorOpCost {
// model based on minimal reciprocal throughput numbers from Intel or
// Agner Fog's tables would be better than what is there now.
template <typename ArgType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
return internal::functor_traits<
internal::scalar_product_op<ArgType, ArgType> >::Cost;
}
template <typename ArgType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
}
template <typename ArgType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
return internal::functor_traits<
internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
}
template <typename ArgType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
}
template <typename SrcType, typename TargetType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
return internal::functor_traits<
internal::scalar_cast_op<SrcType, TargetType> >::Cost;
}
+ EIGEN_DEVICE_FUNC
TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+ EIGEN_DEVICE_FUNC
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
: bytes_loaded_(bytes_loaded),
bytes_stored_(bytes_stored),
compute_cycles_(compute_cycles) {}
+ EIGEN_DEVICE_FUNC
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
bool vectorized, double packet_size)
: bytes_loaded_(bytes_loaded),
bytes_stored_(bytes_stored),
compute_cycles_(vectorized ? compute_cycles / packet_size
: compute_cycles) {
- using std::isfinite;
- eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded));
- eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored));
- eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles));
+ eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+ eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+ eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
@@ -96,21 +94,21 @@ class TensorOpCost {
}
// TODO(rmlarsen): Define min in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
- const TensorOpCost& rhs) {
- bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
- bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
- compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
- return *this;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
// TODO(rmlarsen): Define max in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
- const TensorOpCost& rhs) {
- bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
- bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
- compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
- return *this;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 1d2d162dc..4f5767bc7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -12,6 +12,8 @@
namespace Eigen {
+static const int kCudaScratchSize = 1024;
+
// This defines an interface that GPUDevice can take to use
// CUDA streams underneath.
class StreamInterface {
@@ -24,6 +26,15 @@ class StreamInterface {
// Allocate memory on the actual device where the computation will run
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
+
+ // Return a scratchpad buffer of size 1k
+ virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially set to 0, and each kernel
+  // using it is responsible for resetting it to 0 upon completion
+ // to maintain the invariant that the semaphore is always equal to 0 upon
+ // each kernel start.
+ virtual unsigned int* semaphore() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
@@ -31,7 +42,21 @@ static bool m_devicePropInitialized = false;
static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
- if (!m_devicePropInitialized) {
+ // Attempts to ensure proper behavior in the case of multiple threads
+ // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutexes don't
+    // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support C++11 we
+ // can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+ static std::atomic<bool> first(true);
+ if (first.exchange(false)) {
+#else
+ static bool first = true;
+ if (first) {
+ first = false;
+#endif
+ // We're the first thread to reach this point.
int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices);
if (status != cudaSuccess) {
@@ -52,7 +77,19 @@ static void initializeDeviceProp() {
assert(status == cudaSuccess);
}
}
+
+#if __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_release);
+#endif
m_devicePropInitialized = true;
+ } else {
+      // Wait for the other thread to initialize the properties.
+ while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+ sleep(1);
+ }
}
}
}
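Aside (not part of the patch): the guard added above is essentially a hand-rolled call_once built from an atomic exchange plus fences, used because std::mutex and std::call_once are not usable under nvcc. A compact host-only sketch of the same idea:

    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <thread>

    static std::atomic<bool> g_started(false);
    static std::atomic<bool> g_done(false);

    // First caller performs the expensive init; everyone else waits until done.
    void init_once() {
      if (!g_started.exchange(true)) {
        std::printf("initializing device properties\n");  // expensive work here
        g_done.store(true, std::memory_order_release);
      } else {
        while (!g_done.load(std::memory_order_acquire))
          std::this_thread::sleep_for(std::chrono::milliseconds(1));
      }
    }

    int main() {
      std::thread a(init_once), b(init_once);
      a.join();
      b.join();
      return 0;
    }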
@@ -62,12 +99,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
- CudaStreamDevice() : stream_(&default_stream) {
+ CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
cudaGetDevice(&device_);
initializeDeviceProp();
}
// Use the default stream on the specified device
- CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+ CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
initializeDeviceProp();
}
// Use the specified stream. Note that it's the
@@ -75,7 +112,7 @@ class CudaStreamDevice : public StreamInterface {
// the specified device. If no device is specified the code
// assumes that the stream is associated to the current gpu device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
- : stream_(stream), device_(device) {
+ : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) {
cudaGetDevice(&device_);
} else {
@@ -89,6 +126,12 @@ class CudaStreamDevice : public StreamInterface {
initializeDeviceProp();
}
+ virtual ~CudaStreamDevice() {
+ if (scratch_) {
+ deallocate(scratch_);
+ }
+ }
+
const cudaStream_t& stream() const { return *stream_; }
const cudaDeviceProp& deviceProperties() const {
return m_deviceProperties[device_];
@@ -112,9 +155,29 @@ class CudaStreamDevice : public StreamInterface {
assert(err == cudaSuccess);
}
+ virtual void* scratchpad() const {
+ if (scratch_ == NULL) {
+ scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
+ }
+ return scratch_;
+ }
+
+ virtual unsigned int* semaphore() const {
+ if (semaphore_ == NULL) {
+ char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+ semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+ cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == cudaSuccess);
+ }
+ return semaphore_;
+ }
+
private:
const cudaStream_t* stream_;
int device_;
+ mutable void* scratch_;
+ mutable unsigned int* semaphore_;
};
struct GpuDevice {
@@ -131,22 +194,20 @@ struct GpuDevice {
return stream_->stream();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return stream_->allocate(num_bytes);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return NULL;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
stream_->deallocate(buffer);
+ }
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
+ EIGEN_STRONG_INLINE void* scratchpad() const {
+ return stream_->scratchpad();
+ }
+
+ EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+ return stream_->semaphore();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
@@ -156,30 +217,22 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
@@ -188,21 +241,21 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+ EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on cuda devices.
return firstLevelCacheSize();
@@ -222,56 +275,26 @@ struct GpuDevice {
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
return stream_->deviceProperties().multiProcessorCount;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
return stream_->deviceProperties().maxThreadsPerBlock;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
return stream_->deviceProperties().sharedMemPerBlock;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return stream_->deviceProperties().major;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int minorDeviceVersion() const {
return stream_->deviceProperties().minor;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+ EIGEN_STRONG_INLINE int maxBlocks() const {
return max_blocks_;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index c02891465..069680a11 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -14,7 +14,7 @@ namespace Eigen {
-// Use the SimpleThreadPool by default. We'll switch to the new non blocking
-// thread pool later.
+// Use the NonBlockingThreadPool by default unless EIGEN_USE_SIMPLE_THREAD_POOL
+// is explicitly requested.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
-// Build a thread pool device on top the an existing pool of threads.
+// Build a thread pool device on top of an existing pool of threads.
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
- ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+ ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
::memset(buffer, c, n);
}
- EIGEN_STRONG_INLINE size_t numThreads() const {
+ EIGEN_STRONG_INLINE int numThreads() const {
return num_threads_;
}
@@ -151,9 +151,7 @@ struct ThreadPoolDevice {
template <class Function, class... Args>
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
Notification* n = new Notification();
- std::function<void()> func =
- std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
return n;
}
@@ -161,20 +159,118 @@ struct ThreadPoolDevice {
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
Function&& f,
Args&&... args) const {
- std::function<void()> func = std::bind(
- &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(
+ &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
- std::function<void()> func = std::bind(f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(f, args...));
+ }
+
+ // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+ // called from one of the threads in pool_. Returns -1 otherwise.
+ EIGEN_STRONG_INLINE int currentThreadId() const {
+ return pool_->CurrentThreadId();
+ }
+
+ // parallelFor executes f with [0, n) arguments in parallel and waits for
+ // completion. F accepts a half-open interval [first, last).
+  // Block size is chosen based on the iteration cost and resulting parallel
+ // efficiency. If block_align is not nullptr, it is called to round up the
+ // block size.
+ void parallelFor(Index n, const TensorOpCost& cost,
+ std::function<Index(Index)> block_align,
+ std::function<void(Index, Index)> f) const {
+ typedef TensorCostModel<ThreadPoolDevice> CostModel;
+ if (n <= 1 || numThreads() == 1 ||
+ CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+ f(0, n);
+ return;
+ }
+
+ // Calculate block size based on (1) the iteration cost and (2) parallel
+    // efficiency. We want blocks to be not too small, to mitigate
+    // parallelization overheads; not too large, to mitigate tail effects and
+    // potential load imbalance; and we also want the number of blocks to be
+    // evenly divisible across threads.
+
+ double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+ Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
+ const Index max_block_size =
+ numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+ if (block_align) {
+ Index new_block_size = block_align(block_size);
+ eigen_assert(new_block_size >= block_size);
+ block_size = numext::mini(n, new_block_size);
+ }
+ Index block_count = divup(n, block_size);
+ // Calculate parallel efficiency as fraction of total CPU time used for
+ // computations:
+ double max_efficiency =
+ static_cast<double>(block_count) /
+ (divup<int>(block_count, numThreads()) * numThreads());
+ // Now try to increase block size up to max_block_size as long as it
+ // doesn't decrease parallel efficiency.
+ for (Index prev_block_count = block_count; prev_block_count > 1;) {
+      // This is the next block size that divides the range into a smaller
+      // number of blocks than the current block_size.
+ Index coarser_block_size = divup(n, prev_block_count - 1);
+ if (block_align) {
+ Index new_block_size = block_align(coarser_block_size);
+ eigen_assert(new_block_size >= coarser_block_size);
+ coarser_block_size = numext::mini(n, new_block_size);
+ }
+ if (coarser_block_size > max_block_size) {
+ break; // Reached max block size. Stop.
+ }
+ // Recalculate parallel efficiency.
+ const Index coarser_block_count = divup(n, coarser_block_size);
+ eigen_assert(coarser_block_count < prev_block_count);
+ prev_block_count = coarser_block_count;
+ const double coarser_efficiency =
+ static_cast<double>(coarser_block_count) /
+ (divup<int>(coarser_block_count, numThreads()) * numThreads());
+ if (coarser_efficiency + 0.01 >= max_efficiency) {
+ // Taking it.
+ block_size = coarser_block_size;
+ block_count = coarser_block_count;
+ if (max_efficiency < coarser_efficiency) {
+ max_efficiency = coarser_efficiency;
+ }
+ }
+ }
+
+    // Recursively divide the range into halves until we reach block_size.
+    // The division code rounds mid to a multiple of block_size, so we are
+    // guaranteed to get block_count leaves that do the actual computations.
+ Barrier barrier(static_cast<unsigned int>(block_count));
+ std::function<void(Index, Index)> handleRange;
+ handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
+ if (last - first <= block_size) {
+ // Single block or less, execute directly.
+ f(first, last);
+ barrier.Notify();
+ return;
+ }
+ // Split into halves and submit to the pool.
+ Index mid = first + divup((last - first) / 2, block_size) * block_size;
+ pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
+ pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+ };
+ handleRange(0, n);
+ barrier.Wait();
+ }
+
+ // Convenience wrapper for parallelFor that does not align blocks.
+ void parallelFor(Index n, const TensorOpCost& cost,
+ std::function<void(Index, Index)> f) const {
+ parallelFor(n, cost, nullptr, std::move(f));
}
private:
ThreadPoolInterface* pool_;
- size_t num_threads_;
+ int num_threads_;
};
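For illustration (not part of the patch), here is roughly how the new parallelFor entry point is meant to be used; the pool construction and cost numbers are assumptions, not taken from this diff.

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cstdio>
    #include <vector>

    int main() {
      Eigen::ThreadPool pool(4);                 // 4 worker threads
      Eigen::ThreadPoolDevice device(&pool, 4);
      const Eigen::Index n = 1 << 20;
      std::vector<float> data(n, 1.0f);
      // Rough per-element cost: one load, one store, one cycle of compute.
      Eigen::TensorOpCost cost(sizeof(float), sizeof(float), 1.0);
      device.parallelFor(n, cost, [&](Eigen::Index first, Eigen::Index last) {
        for (Eigen::Index i = first; i < last; ++i) data[i] *= 2.0f;
      });
      std::printf("data[0] = %f\n", data[0]);
      return 0;
    }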
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
index ca9ac79df..1a30e45fb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
@@ -44,7 +44,7 @@ template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(c
}
-#if defined(EIGEN_HAS_CONSTEXPR)
+#if EIGEN_HAS_CONSTEXPR
template <typename Index, std::size_t Rank>
struct index_known_statically_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index f0b8ac958..b24cdebf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -29,14 +29,6 @@ namespace Eigen {
* \sa Tensor
*/
-// Can't use std::pair on cuda devices
-template <typename Index> struct IndexPair {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { }
- Index first;
- Index second;
-};
-
// Boilerplate code
namespace internal {
@@ -115,7 +107,7 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
// todo: add assertion
@@ -182,7 +174,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
return *this;
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
explicit Sizes(std::initializer_list<std::size_t>) {
// todo: add assertion
@@ -190,13 +182,13 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
#else
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
#endif
@@ -290,31 +282,31 @@ struct DSizes : array<DenseIndex, NumDims> {
(*this)[0] = i0;
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#else
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
eigen_assert(NumDims == 2);
(*this)[0] = i0;
(*this)[1] = i1;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
eigen_assert(NumDims == 3);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
eigen_assert(NumDims == 4);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
(*this)[3] = i3;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
eigen_assert(NumDims == 5);
(*this)[0] = i0;
(*this)[1] = i1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index c556fec0f..a08dfa7c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -56,7 +56,7 @@ struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType>
template<typename XprType>
-class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> >
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
@@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = true,
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index ae4ce3c90..61c111cec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -129,6 +129,10 @@ template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double loadConstant(const double* address) {
return __ldg(address);
}
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+Eigen::half loadConstant(const Eigen::half* address) {
+ return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
+}
#endif
}
@@ -222,7 +226,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
- : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
+ : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
{ }
typedef typename XprType::Index Index;
@@ -239,13 +243,13 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
- return m_functor(index);
+ return m_wrapper(m_functor, index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- return m_functor.template packetOp<Index, PacketReturnType>(index);
+ return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -259,6 +263,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
private:
const NullaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
+ const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
};
@@ -399,6 +404,101 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorEvaluator<RightArgType, Device> m_rightImpl;
};
+// -------------------- CwiseTernaryOp --------------------
+
+template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
+{
+ typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
+ internal::functor_traits<TernaryOp>::PacketAccess,
+ Layout = TensorEvaluator<Arg1Type, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+ : m_functor(op.functor()),
+ m_arg1Impl(op.arg1Expression(), device),
+ m_arg2Impl(op.arg2Expression(), device),
+ m_arg3Impl(op.arg3Expression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg2Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg3Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg2Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg3Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+
+ eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+ return m_arg1Impl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_arg1Impl.evalSubExprsIfNeeded(NULL);
+ m_arg2Impl.evalSubExprsIfNeeded(NULL);
+ m_arg3Impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_arg1Impl.cleanup();
+ m_arg2Impl.cleanup();
+ m_arg3Impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
+ m_arg2Impl.template packet<LoadMode>(index),
+ m_arg3Impl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+ return m_arg1Impl.costPerCoeff(vectorized) +
+ m_arg2Impl.costPerCoeff(vectorized) +
+ m_arg3Impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+ const TernaryOp m_functor;
+ TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+ TensorEvaluator<Arg2Type, Device> m_arg2Impl;
+ TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
// -------------------- SelectOp --------------------
@@ -475,7 +575,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
private:
TensorEvaluator<IfArgType, Device> m_condImpl;
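
The ternary evaluator added in the hunk above forwards each coefficient access to its three argument evaluators and applies the functor to the results. A minimal standalone sketch of that forwarding pattern (not Eigen code; ToyEvaluator, ToyTernaryEvaluator and fma_op are purely hypothetical names):

#include <cstdio>
#include <vector>

struct fma_op {                        // stands in for a TernaryOp functor
  double operator()(double a, double b, double c) const { return a * b + c; }
};

struct ToyEvaluator {                  // stands in for TensorEvaluator<ArgType, Device>
  std::vector<double> data;
  double coeff(int i) const { return data[i]; }
};

template <typename TernaryOp>
struct ToyTernaryEvaluator {           // mirrors the coeff() forwarding of the ternary evaluator
  TernaryOp functor;
  ToyEvaluator arg1, arg2, arg3;
  double coeff(int i) const {
    return functor(arg1.coeff(i), arg2.coeff(i), arg3.coeff(i));
  }
};

int main() {
  ToyTernaryEvaluator<fma_op> e{fma_op{}, {{1, 2, 3}}, {{10, 10, 10}}, {{0.5, 0.5, 0.5}}};
  for (int i = 0; i < 3; ++i) std::printf("%g\n", e.coeff(i));  // 10.5 20.5 30.5
}
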
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 5c3d4d630..0cac7b179 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,13 +59,14 @@ class TensorExecutor<Expression, DefaultDevice, true>
{
const Index size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
- // Manually unroll this loop since compilers don't do it.
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
+ // unroll the loop at the expense of inlining.
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
- evaluator.evalPacket(i);
- evaluator.evalPacket(i+PacketSize);
- evaluator.evalPacket(i+2*PacketSize);
- evaluator.evalPacket(i+3*PacketSize);
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
}
const Index VectorizedSize = (size / PacketSize) * PacketSize;
for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
@@ -92,24 +93,30 @@ struct EvalRange {
evaluator.evalScalar(i);
}
}
+
+ static Index alignBlockSize(Index size) {
+ return size;
+ }
};
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
+ static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
Index i = first;
- const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
Index last_chunk_offset = last - 4 * PacketSize;
- // Manually unroll this loop since compilers don't do it.
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
+ // unroll the loop at the expense of inlining.
for (; i <= last_chunk_offset; i += 4*PacketSize) {
- evaluator.evalPacket(i);
- evaluator.evalPacket(i+PacketSize);
- evaluator.evalPacket(i+2*PacketSize);
- evaluator.evalPacket(i+3*PacketSize);
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
}
last_chunk_offset = last - PacketSize;
for (; i <= last_chunk_offset; i += PacketSize) {
@@ -120,6 +127,15 @@ struct EvalRange<Evaluator, Index, true> {
evaluator.evalScalar(i);
}
}
+
+ static Index alignBlockSize(Index size) {
+ // Align block size to packet size and account for unrolling in run above.
+ if (size >= 16 * PacketSize) {
+ return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
+ }
+ // Aligning to 4 * PacketSize would increase block size by more than 25%.
+ return (size + PacketSize - 1) & ~(PacketSize - 1);
+ }
};
template <typename Expression, bool Vectorizable>
@@ -133,18 +149,23 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
- const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
const Index size = array_prod(evaluator.dimensions());
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+ device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+ EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+ [&evaluator](Index first, Index last) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+ });
+#else
size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
if (num_threads > 1) {
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
-#endif
if (num_threads == 1) {
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
} else {
+ const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
@@ -161,11 +182,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
}
barrier.Wait();
}
+#endif // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
}
evaluator.cleanup();
}
};
-#endif
+#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
@@ -212,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
- // Cuda memcopies the kernel arguments. That's fine for POD, but for more
- // complex types such as evaluators we should really conform to the C++
- // standard and call a proper copy constructor.
- Evaluator eval(memcopied_eval);
-
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}
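
The alignBlockSize() helper added above rounds the requested block size up to a multiple of the packet width using the usual power-of-two trick (size + k - 1) & ~(k - 1). A small standalone check of that arithmetic, with PacketSize = 4 chosen purely for illustration:

#include <cassert>
#include <cstdio>

int main() {
  const int PacketSize = 4;                       // e.g. 4 floats per SSE packet
  auto align = [=](int size) {
    if (size >= 16 * PacketSize)                  // large block: round up to 4 * PacketSize
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    return (size + PacketSize - 1) & ~(PacketSize - 1);   // small block: round up to PacketSize
  };
  assert(align(100) == 112);   // 100 >= 64, rounded up to the next multiple of 16
  assert(align(64)  == 64);    // already aligned
  assert(align(30)  == 32);    // small block, rounded up to a multiple of 4 only
  std::printf("alignBlockSize rounding examples OK\n");
}
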
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 8491c4ca2..5f2e329f2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -219,6 +219,86 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsX
namespace internal {
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
+{
+ // Type promotion to handle the case where the types of the args are different.
+ typedef typename result_of<
+ TernaryOp(typename Arg1XprType::Scalar,
+ typename Arg2XprType::Scalar,
+ typename Arg3XprType::Scalar)>::type Scalar;
+ typedef traits<Arg1XprType> XprTraits;
+ typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+ typedef typename traits<Arg1XprType>::Index Index;
+ typedef typename Arg1XprType::Nested Arg1Nested;
+ typedef typename Arg2XprType::Nested Arg2Nested;
+ typedef typename Arg3XprType::Nested Arg3Nested;
+ typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+ typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+ typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+ typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+ typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef Scalar CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+ : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const TernaryOp& functor() const { return m_functor; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+ arg1Expression() const { return m_arg1_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+ arg2Expression() const { return m_arg2_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+ arg3Expression() const { return m_arg3_xpr; }
+
+ protected:
+ typename Arg1XprType::Nested m_arg1_xpr;
+ typename Arg2XprType::Nested m_arg2_xpr;
+ typename Arg3XprType::Nested m_arg3_xpr;
+ const TernaryOp m_functor;
+};
+
+
+namespace internal {
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
: traits<ThenXprType>
@@ -252,7 +332,7 @@ struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename e
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
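
The traits specialization above derives the expression's Scalar from the functor's return type, so mixed argument scalar types are promoted to whatever the ternary functor returns. A standalone sketch of that mechanism (not Eigen code; mixed_op and toy_ternary_traits are hypothetical names, and decltype/declval is used here as the equivalent of result_of):

#include <type_traits>
#include <utility>

struct mixed_op {
  double operator()(float a, double b, int c) const { return a + b + c; }
};

template <typename Op, typename A, typename B, typename C>
struct toy_ternary_traits {
  // mirrors result_of<TernaryOp(Arg1::Scalar, Arg2::Scalar, Arg3::Scalar)>::type
  typedef decltype(std::declval<Op>()(std::declval<A>(), std::declval<B>(), std::declval<C>())) Scalar;
};

static_assert(std::is_same<toy_ternary_traits<mixed_op, float, double, int>::Scalar, double>::value,
              "the promoted Scalar is the functor's return type");

int main() { return 0; }
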
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index ece2ed91b..08eb5595a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -329,7 +329,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
- a[i] = data[i] * std::conj(pos_j_base_powered[i]);
+ a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
}
else {
a[i] = data[i] * pos_j_base_powered[i];
@@ -344,7 +344,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
b[i] = pos_j_base_powered[i];
}
else {
- b[i] = std::conj(pos_j_base_powered[i]);
+ b[i] = numext::conj(pos_j_base_powered[i]);
}
}
for (Index i = n; i < m - n; ++i) {
@@ -355,7 +355,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
b[i] = pos_j_base_powered[m-i];
}
else {
- b[i] = std::conj(pos_j_base_powered[m-i]);
+ b[i] = numext::conj(pos_j_base_powered[m-i]);
}
}
@@ -379,7 +379,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
- data[i] = a[i] * std::conj(pos_j_base_powered[i]);
+ data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
}
else {
data[i] = a[i] * pos_j_base_powered[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index b27ee0084..fcee5f60d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -65,7 +65,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
{
@@ -97,7 +97,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
{
@@ -128,7 +128,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
return m_storage.data()[0];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
{
@@ -213,7 +213,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
return coeff(index);
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
{
@@ -309,7 +309,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
}
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
: m_storage(other.m_storage)
{
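
The #ifdef -> #if changes above matter because Eigen's feature-test macros (EIGEN_HAS_VARIADIC_TEMPLATES, EIGEN_HAS_RVALUE_REFERENCES) are always defined, to either 0 or 1, so #ifdef would enable the guarded code even when the feature is turned off. A toy macro shows the difference:

#include <cstdio>

#define TOY_HAS_FEATURE 0      // defined, but the feature is "off"

int main() {
#ifdef TOY_HAS_FEATURE
  std::printf("#ifdef only tests presence, so this prints even though the value is 0\n");
#endif
#if TOY_HAS_FEATURE
  std::printf("never reached\n");
#else
  std::printf("#if tests the value, so the disabled feature is correctly skipped\n");
#endif
  return 0;
}
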
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 7ec757519..c23ecdbc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -55,7 +55,7 @@ struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<X
template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> >
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
- const Index numValues = m_impl.dimensions().TotalSize();
+ const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.
if (NumTraits<CoeffReturnType>::RequireInitialization) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index a8bd8b888..490ddd8bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -16,11 +16,12 @@ template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType
template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
template<typename PlainObjectType> class TensorRef;
-template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
+template<typename Derived, int AccessLevel> class TensorBase;
template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
template<typename XprType> class TensorIndexTupleOp;
@@ -42,9 +43,11 @@ template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
template<typename Shuffle, typename XprType> class TensorShufflingOp;
template<typename Strides, typename XprType> class TensorStridingOp;
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp;
template<typename Strides, typename XprType> class TensorInflationOp;
template<typename Generator, typename XprType> class TensorGeneratorOp;
template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
+template<typename Op, typename XprType> class TensorScanOp;
template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 33cd00391..7164e8d60 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,7 @@ struct scalar_mod_op {
};
template <typename Scalar>
struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
/** \internal
@@ -38,7 +38,7 @@ struct scalar_mod2_op {
};
template <typename Scalar>
struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
template <typename Scalar>
struct scalar_fmod_op {
@@ -69,7 +69,7 @@ struct scalar_sigmoid_op {
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(const Packet& x) const {
- const Packet one = pset1<Packet>(1);
+ const Packet one = pset1<Packet>(T(1));
return pdiv(one, padd(one, pexp(pnegate(x))));
}
};
@@ -84,14 +84,23 @@ struct functor_traits<scalar_sigmoid_op<T> > {
};
+template<typename Reducer, typename Device>
+struct reducer_traits {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
// Standard reduction functors
template <typename T> struct SumReducer
{
- static const bool PacketAccess = true;
+ static const bool PacketAccess = packet_traits<T>::HasAdd;
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
- (*accum) += t;
+ internal::scalar_sum_op<T> sum_op;
+ *accum = sum_op(*accum, t);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -119,16 +128,26 @@ template <typename T> struct SumReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd
+ };
+};
+
+
template <typename T> struct MeanReducer
{
- static const bool PacketAccess = !NumTraits<T>::IsInteger;
+ static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
static const bool IsStateful = true;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
MeanReducer() : scalarCount_(0), packetCount_(0) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
- (*accum) += t;
+ internal::scalar_sum_op<T> sum_op;
+ *accum = sum_op(*accum, t);
scalarCount_++;
}
template <typename Packet>
@@ -162,9 +181,44 @@ template <typename T> struct MeanReducer
DenseIndex packetCount_;
};
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd
+ };
+};
+
+
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::lowest();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return -Eigen::NumTraits<T>::infinity();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::highest();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::infinity();
+ }
+};
+
+
template <typename T> struct MaxReducer
{
- static const bool PacketAccess = true;
+ static const bool PacketAccess = packet_traits<T>::HasMax;
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -174,9 +228,8 @@ template <typename T> struct MaxReducer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmax<Packet>(*accum, p);
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return Eigen::NumTraits<T>::lowest();
+ return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -195,9 +248,18 @@ template <typename T> struct MaxReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMax
+ };
+};
+
+
template <typename T> struct MinReducer
{
- static const bool PacketAccess = true;
+ static const bool PacketAccess = packet_traits<T>::HasMin;
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -207,9 +269,8 @@ template <typename T> struct MinReducer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmin<Packet>(*accum, p);
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return Eigen::NumTraits<T>::highest();
+ return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -228,10 +289,18 @@ template <typename T> struct MinReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMin
+ };
+};
+
template <typename T> struct ProdReducer
{
- static const bool PacketAccess = true;
+ static const bool PacketAccess = packet_traits<T>::HasMul;
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -263,6 +332,14 @@ template <typename T> struct ProdReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::MulCost,
+ PacketAccess = PacketType<T, Device>::HasMul
+ };
+};
+
struct AndReducer
{
@@ -280,6 +357,15 @@ struct AndReducer
}
};
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
struct OrReducer {
static const bool PacketAccess = false;
static const bool IsStateful = false;
@@ -295,6 +381,15 @@ struct OrReducer {
}
};
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
// Argmin/Argmax reducers
template <typename T> struct ArgMaxTupleReducer
{
@@ -312,6 +407,15 @@ template <typename T> struct ArgMaxTupleReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
+ };
+};
+
+
template <typename T> struct ArgMinTupleReducer
{
static const bool PacketAccess = false;
@@ -328,457 +432,11 @@ template <typename T> struct ArgMinTupleReducer
}
};
-
-// Random number generation
-namespace {
-#ifdef __CUDA_ARCH__
-__device__ int get_random_seed() {
- return clock();
-}
-#else
-int get_random_seed() {
-#ifdef _WIN32
- SYSTEMTIME st;
- GetSystemTime(&st);
- return st.wSecond + 1000 * st.wMilliseconds;
-#elif defined __APPLE__
- return static_cast<int>(mach_absolute_time());
-#else
- timespec ts;
- clock_gettime(CLOCK_REALTIME, &ts);
- return static_cast<int>(ts.tv_nsec);
-#endif
-}
-#endif
-}
-
-#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
-// We're not compiling a cuda kernel
-template <typename T> class UniformRandomGenerator {
-
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- if (!deterministic) {
- srand(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- }
-
- template<typename Index>
- T operator()(Index) const {
- return random<T>();
- }
- template<typename Index, typename PacketType>
- PacketType packetOp(Index) const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX T values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
- values[i] = random<T>();
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- bool m_deterministic;
-};
-
-#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
-template <> class UniformRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
- m_generator = new std::mt19937();
- m_generator->seed(other(0) * UINT_MAX);
- m_deterministic = other.m_deterministic;
- }
- ~UniformRandomGenerator() {
- delete m_generator;
- }
-
- template<typename Index>
- float operator()(Index) const {
- return m_distribution(*m_generator);
- }
- template<typename Index, typename PacketType>
- PacketType packetOp(Index i) const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX float values[packetSize];
- for (int k = 0; k < packetSize; ++k) {
- values[k] = this->operator()(i);
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- UniformRandomGenerator& operator = (const UniformRandomGenerator&);
- // Make sure m_deterministic comes first to match the layout of the cpu
- // version of the code.
- bool m_deterministic;
- std::mt19937* m_generator;
- mutable std::uniform_real_distribution<float> m_distribution;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
- m_generator = new std::mt19937();
- m_generator->seed(other(0) * UINT_MAX);
- m_deterministic = other.m_deterministic;
- }
- ~UniformRandomGenerator() {
- delete m_generator;
- }
-
- template<typename Index>
- double operator()(Index) const {
- return m_distribution(*m_generator);
- }
- template<typename Index, typename PacketType>
- PacketType packetOp(Index i) const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX double values[packetSize];
- for (int k = 0; k < packetSize; ++k) {
- values[k] = this->operator()(i);
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- UniformRandomGenerator& operator = (const UniformRandomGenerator&);
- // Make sure m_deterministic comes first to match the layout of the cpu
- // version of the code.
- bool m_deterministic;
- std::mt19937* m_generator;
- mutable std::uniform_real_distribution<double> m_distribution;
-};
-#endif
-
-#else
-
-// We're compiling a cuda kernel
-template <typename T> class UniformRandomGenerator;
-
-template <> class UniformRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
-
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
-
- template<typename Index>
- __device__ float operator()(Index) const {
- return curand_uniform(&m_state);
- }
- template<typename Index, typename PacketType>
- __device__ float4 packetOp(Index) const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_uniform4(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ double operator()(Index) const {
- return curand_uniform_double(&m_state);
- }
- template<typename Index, typename PacketType>
- __device__ double2 packetOp(Index) const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_uniform2_double(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<float> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ std::complex<float> operator()(Index) const {
- float4 vals = curand_uniform4(&m_state);
- return std::complex<float>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<double> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ std::complex<double> operator()(Index) const {
- double2 vals = curand_uniform2_double(&m_state);
- return std::complex<double>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<UniformRandomGenerator<Scalar> > {
- enum {
- // Rough estimate.
- Cost = 100 * NumTraits<Scalar>::MulCost,
- PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
- };
-};
-
-
-
-#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
-// We're not compiling a cuda kernel
-template <typename T> class NormalRandomGenerator {
- public:
- static const bool PacketAccess = true;
-
- NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- NormalRandomGenerator(const NormalRandomGenerator& other)
- : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
- m_generator->seed(other(0) * UINT_MAX);
- }
- ~NormalRandomGenerator() {
- delete m_generator;
- }
- template<typename Index>
- T operator()(Index) const {
- return m_distribution(*m_generator);
- }
- template<typename Index, typename PacketType>
- PacketType packetOp(Index) const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX T values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
- values[i] = m_distribution(*m_generator);
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- // No assignment
- NormalRandomGenerator& operator = (const NormalRandomGenerator&);
-
- bool m_deterministic;
- mutable std::normal_distribution<T> m_distribution;
- std::mt19937* m_generator;
-};
-
-#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
-
-// We're compiling a cuda kernel
-template <typename T> class NormalRandomGenerator;
-
-template <> class NormalRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ float operator()(Index) const {
- return curand_normal(&m_state);
- }
- template<typename Index, typename PacketType>
- __device__ float4 packetOp(Index) const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_normal4(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ double operator()(Index) const {
- return curand_normal_double(&m_state);
- }
- template<typename Index, typename PacketType>
- __device__ double2 packetOp(Index) const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_normal2_double(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<float> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ std::complex<float> operator()(Index) const {
- float4 vals = curand_normal4(&m_state);
- return std::complex<float>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<double> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ std::complex<double> operator()(Index) const {
- double2 vals = curand_normal2_double(&m_state);
- return std::complex<double>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#else
-
-template <typename T> class NormalRandomGenerator {
- public:
- static const bool PacketAccess = false;
- NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}
-
- private:
- bool m_deterministic;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<NormalRandomGenerator<Scalar> > {
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
enum {
- // Rough estimate.
- Cost = 100 * NumTraits<Scalar>::MulCost,
- PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
};
};
@@ -797,7 +455,7 @@ class GaussianGenerator {
}
}
- T operator()(const array<Index, NumDims>& coordinates) const {
+ EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
T tmp = T(0);
for (size_t i = 0; i < NumDims; ++i) {
T offset = coordinates[i] - m_means[i];
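
The MinMaxBottomValue helper introduced above picks the identity element for min/max reductions: max-reductions start from the lowest representable value (or -infinity for floating point), min-reductions from the highest (or +infinity). A standalone sketch of the same dispatch, with std::numeric_limits standing in for Eigen::NumTraits and toy_bottom_value as a hypothetical name:

#include <cstdio>
#include <limits>

template <typename T, bool IsMax, bool IsInteger>
struct toy_bottom_value {                       // IsMax over integers
  static T get() { return std::numeric_limits<T>::lowest(); }
};
template <typename T>
struct toy_bottom_value<T, true, false> {       // max over floating point
  static T get() { return -std::numeric_limits<T>::infinity(); }
};
template <typename T>
struct toy_bottom_value<T, false, true> {       // min over integers
  static T get() { return std::numeric_limits<T>::max(); }
};
template <typename T>
struct toy_bottom_value<T, false, false> {      // min over floating point
  static T get() { return std::numeric_limits<T>::infinity(); }
};

int main() {
  std::printf("max<int>   starts at %d\n", toy_bottom_value<int, true, true>::get());
  std::printf("max<float> starts at %f\n", toy_bottom_value<float, true, false>::get());
  std::printf("min<float> starts at %f\n", toy_bottom_value<float, false, false>::get());
}
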
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 8ff7d5815..eb1d4934e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -134,7 +134,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 000000000..665b861cf
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) applied to the given tensors.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+ TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+ const ADerived, const BDerived, const XDerived>
+ betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+ return TensorCwiseTernaryOp<
+ internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+ const BDerived, const XDerived>(
+ a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
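
A usage sketch for the new global betainc(), assuming the CXX11 Tensor module header is available and internal::scalar_betainc_op is implemented for the chosen scalar type:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 1> a(3), b(3), x(3);
  a.setConstant(2.0f);
  b.setConstant(5.0f);
  x.setValues({0.1f, 0.5f, 0.9f});

  // Element-wise regularized incomplete beta function I_x(a, b).
  Eigen::Tensor<float, 1> result = Eigen::betainc(a, b, x);
  std::cout << result << std::endl;
}
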
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index 38a833f82..a901c5dd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -13,38 +13,61 @@
namespace Eigen {
namespace internal {
-template<>
-struct significant_decimals_impl<std::string>
- : significant_decimals_default_impl<std::string, true>
-{};
-}
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+ typedef typename Tensor::Index Index;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+ if (total_size > 0) {
+ const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+ static const int layout = Tensor::Layout;
+ Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+ os << matrix;
+ }
+ }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+ typedef typename Tensor::Index Index;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+ if (total_size > 0) {
+ Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+ os << array;
+ }
+ }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ os << tensor.coeff(0);
+ }
+};
+}
template <typename T>
std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+ typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+ typedef typename Evaluator::Dimensions Dimensions;
+
// Evaluate the expression if needed
TensorForcedEvalOp<const T> eval = expr.eval();
- TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+ Evaluator tensor(eval, DefaultDevice());
tensor.evalSubExprsIfNeeded(NULL);
- typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
- typedef typename T::Index Index;
- typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
- const Index total_size = internal::array_prod(tensor.dimensions());
-
- // Print the tensor as a 1d vector or a 2d matrix.
+ // Print the result
static const int rank = internal::array_size<Dimensions>::value;
- if (rank == 0) {
- os << tensor.coeff(0);
- } else if (rank == 1) {
- Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
- os << array;
- } else {
- const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
- static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
- Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
- os << matrix;
- }
+ internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
// Cleanup.
tensor.cleanup();
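
A usage sketch of the operator<< path refactored above: rank-0 tensors print as a single coefficient, rank-1 as a vector, and higher ranks as a first_dim x (size/first_dim) matrix. Assuming the CXX11 Tensor module header is available:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<int, 0> scalar;
  scalar() = 7;
  Eigen::Tensor<int, 1> vec(3);
  vec.setValues({1, 2, 3});
  Eigen::Tensor<int, 2> mat(2, 3);
  mat.setValues({{1, 2, 3}, {4, 5, 6}});

  std::cout << scalar << "\n\n"   // TensorPrinter<_, 0>: single coefficient
            << vec    << "\n\n"   // TensorPrinter<_, 1>: printed as a vector
            << mat    << "\n";    // generic case: printed as a 2x3 matrix
}
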
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index bafcc67bd..566856ed2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -174,7 +174,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
- EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
m_paddingValue = op.padding_value();
@@ -362,7 +362,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 985594bc8..3209fecd3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -10,7 +10,8 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
-#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
+
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
#define EIGEN_HAS_INDEX_LIST
@@ -45,6 +46,24 @@ struct type2index {
}
};
+// This can be used with IndexPairList to get compile-time constant pairs,
+// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
+template <DenseIndex f, DenseIndex s>
+struct type2indexpair {
+ static const DenseIndex first = f;
+ static const DenseIndex second = s;
+
+ constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {
+ return IndexPair<DenseIndex>(f, s);
+ }
+
+ EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
+ eigen_assert(val.first == f);
+ eigen_assert(val.second == s);
+ }
+};
+
+
template<DenseIndex n> struct NumTraits<type2index<n> >
{
typedef DenseIndex Real;
@@ -73,6 +92,16 @@ EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
}
template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) {
+ val = new_val;
+}
+template <DenseIndex f, DenseIndex s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {
+ val.set(new_val);
+}
+
+
+template <typename T>
struct is_compile_time_constant {
static constexpr bool value = false;
};
@@ -94,7 +123,22 @@ struct is_compile_time_constant<const type2index<idx>& > {
static constexpr bool value = true;
};
-
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<type2indexpair<f, s> > {
+ static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<const type2indexpair<f, s> > {
+ static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<type2indexpair<f, s>& > {
+ static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<const type2indexpair<f, s>& > {
+ static constexpr bool value = true;
+};
template<typename... T>
@@ -184,31 +228,32 @@ template <typename T, typename... O>
-template <DenseIndex Idx>
+template <DenseIndex Idx, typename ValueT>
struct tuple_coeff {
template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) {
- return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+ EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) {
+ // return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+ return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
}
template <typename... T>
- EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) {
+ EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) {
if (i == Idx) {
update_value(array_get<Idx>(t), value);
} else {
- tuple_coeff<Idx-1>::set(i, t, value);
+ tuple_coeff<Idx-1, ValueT>::set(i, t, value);
}
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) {
return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
- tuple_coeff<Idx-1>::value_known_statically(i, t);
+ tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
- tuple_coeff<Idx-1>::values_up_to_known_statically(t);
+ tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
}
template <typename... T>
@@ -216,19 +261,19 @@ struct tuple_coeff {
return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
array_get<Idx>(t) > array_get<Idx-1>(t) &&
- tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t);
+ tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
}
};
-template <>
-struct tuple_coeff<0> {
+template <typename ValueT>
+struct tuple_coeff<0, ValueT> {
template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) {
+ EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) {
// eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
- return array_get<0>(t) * (i == 0);
+ return array_get<0>(t)/* * (i == 0)*/;
}
template <typename... T>
- EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) {
+ EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
eigen_assert (i == 0);
update_value(array_get<0>(t), value);
}
@@ -254,13 +299,13 @@ struct tuple_coeff<0> {
template<typename FirstType, typename... OtherTypes>
struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
}
EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
@@ -268,14 +313,14 @@ struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
}
EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
}
EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
}
};
@@ -286,6 +331,23 @@ constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, Ot
}
+template<typename FirstType, typename... OtherTypes>
+struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
+ }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
+ }
+
+ EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+ EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
+
+ EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+ }
+};
+
namespace internal {
template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
@@ -303,6 +365,13 @@ template<typename FirstType, typename... OtherTypes> struct array_size<const Ind
static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
};
+template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
+ static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > {
+ static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+
template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
}
@@ -472,6 +541,57 @@ struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
}
};
+
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+ }
+};
+
+
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+ }
+};
+
+
} // end namespace internal
} // end namespace Eigen
@@ -482,53 +602,69 @@ namespace internal {
template <typename T>
struct index_known_statically_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
return false;
}
};
template <typename T>
struct all_indices_known_statically_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
return false;
}
};
template <typename T>
struct indices_statically_known_to_increase_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
return false;
}
};
template <typename T>
struct index_statically_eq_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
return false;
}
};
template <typename T>
struct index_statically_ne_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
return false;
}
};
template <typename T>
struct index_statically_gt_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
return false;
}
};
template <typename T>
struct index_statically_lt_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+ return false;
+ }
+};
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+ return false;
+ }
+};
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
return false;
}
};
+
+
} // end namespace internal
} // end namespace Eigen
@@ -572,6 +708,16 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i,
return index_statically_lt_impl<T>::run(i, value);
}
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) {
+ return index_pair_first_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) {
+ return index_pair_second_statically_eq_impl<T>::run(i, value);
+}
+
} // end namespace internal
} // end namespace Eigen
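
Reviewer note: the new index_pair_first/second_statically_eq helpers follow the usual trait pattern in this file -- the primary template conservatively answers false, and only the IndexPairList specialization can prove anything at compile time. A minimal standalone sketch of that dispatch (plain C++11 with made-up names, not the Eigen implementation):

    #include <iostream>

    // Primary template: nothing is known statically, so always answer "false".
    template <typename T>
    struct pair_first_statically_eq {
      static constexpr bool run(int /*i*/, int /*value*/) { return false; }
    };

    // Toy compile-time pair: the specialization can give a real answer.
    template <int First, int Second>
    struct StaticPair {
      static constexpr int first = First;
      static constexpr int second = Second;
    };

    template <int F, int S>
    struct pair_first_statically_eq<StaticPair<F, S> > {
      static constexpr bool run(int i, int value) {
        return (i == 0) && (F == value);  // only one pair in this toy list
      }
    };

    int main() {
      std::cout << pair_first_statically_eq<int>::run(0, 0) << "\n";                // 0: unknown type
      std::cout << pair_first_statically_eq<StaticPair<0, 3> >::run(0, 0) << "\n";  // 1: first == 0 is provable
    }
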
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index de2f67d74..f391fb9ee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -189,7 +189,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
index 2d223140e..33edc49e3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
#include <initializer_list>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 33c6c1b0f..ede3939c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -29,25 +29,47 @@ namespace Eigen {
namespace internal {
namespace {
+
// Note: result is undefined if val == 0
template <typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val)
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{
#ifdef __CUDA_ARCH__
- return (sizeof(T) == 8) ? __clzll(val) : __clz(val);
+ return __clz(val);
#elif EIGEN_COMP_MSVC
- unsigned long index;
- if (sizeof(T) == 8) {
- _BitScanReverse64(&index, val);
- } else {
- _BitScanReverse(&index, val);
- }
- return (sizeof(T) == 8) ? 63 - index : 31 - index;
+ unsigned long index;
+ _BitScanReverse(&index, val);
+ return 31 - index;
+#else
+ EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return __builtin_clz(static_cast<uint32_t>(val));
+#endif
+ }
+
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
+ {
+#ifdef __CUDA_ARCH__
+ return __clzll(val);
+#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
+ unsigned long index;
+ _BitScanReverse64(&index, val);
+ return 63 - index;
+#elif EIGEN_COMP_MSVC
+ // MSVC's _BitScanReverse64 is not available for 32-bit builds.
+ unsigned int lo = (unsigned int)(val&0xffffffff);
+ unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
+ int n;
+ if(hi==0)
+ n = 32 + count_leading_zeros<unsigned int>(lo);
+ else
+ n = count_leading_zeros<unsigned int>(hi);
+ return n;
#else
EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return (sizeof(T) == 8) ?
- __builtin_clzll(static_cast<uint64_t>(val)) :
- __builtin_clz(static_cast<uint32_t>(val));
+ return __builtin_clzll(static_cast<uint64_t>(val));
#endif
}
@@ -98,7 +120,9 @@ namespace {
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
- TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+ TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
+ - TensorUInt128<static_val<1>, static_val<0> >(1, 0)
+ + TensorUInt128<static_val<0>, static_val<1> >(1);
return static_cast<uint64_t>(result);
#endif
}
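
For context: the single-body version removed above picks between __clz/__clzll (and the 32/64-bit MSVC intrinsics) with a sizeof(T) test inside one function, which does not accommodate the 32-bit MSVC case; the two enable_if-constrained overloads make the width choice part of overload resolution instead. A portable toy version of the same dispatch, with the intrinsics replaced by a shift loop (illustration only, not Eigen code):

    #include <cstdint>
    #include <iostream>
    #include <type_traits>

    // Selected only for 4-byte integers. Like the original, undefined for val == 0.
    template <typename T>
    typename std::enable_if<sizeof(T) == 4, int>::type count_leading_zeros(T val) {
      int n = 0;
      for (std::uint32_t x = static_cast<std::uint32_t>(val); !(x & 0x80000000u); x <<= 1) ++n;
      return n;
    }

    // Selected only for 8-byte integers.
    template <typename T>
    typename std::enable_if<sizeof(T) == 8, int>::type count_leading_zeros(T val) {
      int n = 0;
      for (std::uint64_t x = static_cast<std::uint64_t>(val); !(x & 0x8000000000000000ull); x <<= 1) ++n;
      return n;
    }

    int main() {
      std::cout << count_leading_zeros(std::uint32_t(1)) << "\n";  // 31
      std::cout << count_leading_zeros(std::uint64_t(1)) << "\n";  // 63
    }
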
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index 8ed71f838..ee0078bbc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -28,7 +28,7 @@
// SFINAE requires variadic templates
#ifndef __CUDACC__
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC
#if EIGEN_GNUC_AT_LEAST(4,8)
@@ -44,7 +44,7 @@
typename internal::enable_if< ( __condition__ ) , int >::type = 0
-#if defined(EIGEN_HAS_CONSTEXPR)
+#if EIGEN_HAS_CONSTEXPR
#define EIGEN_CONSTEXPR constexpr
#else
#define EIGEN_CONSTEXPR
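
The #ifdef -> #if changes throughout this patch all have the same cause: a feature macro that is always defined, with its value (0 or 1) carrying the answer, is enabled unconditionally by a definedness test. A small illustration with a made-up macro name:

    #define FEATURE_AVAILABLE 0   // always defined; the value carries the answer

    #ifdef FEATURE_AVAILABLE
    // Taken even though the value is 0 -- the bug class this patch removes.
    #endif

    #if FEATURE_AVAILABLE
    // Taken only when the macro expands to a non-zero value.
    #endif
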
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 9ebd9172b..6fb4f4a31 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -57,7 +57,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
@@ -140,7 +140,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
return m_data[index];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
@@ -227,7 +227,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
return m_data[index];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index cd04716bd..fdb5ee6b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -47,22 +47,39 @@ template <> struct max_n_1<0> {
// Default packet types
template <typename Scalar, typename Device>
-struct PacketType {
+struct PacketType : internal::packet_traits<Scalar> {
typedef typename internal::packet_traits<Scalar>::type type;
- enum { size = internal::unpacket_traits<type>::size };
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
template <>
-struct PacketType<float, GpuDevice> {
- typedef float4 type;
- static const int size = 4;
-};
-template <>
-struct PacketType<double, GpuDevice> {
- typedef double2 type;
+struct PacketType<half, GpuDevice> {
+ typedef half2 type;
static const int size = 2;
+ enum {
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasBlend = 0,
+
+ HasDiv = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasExp = 1,
+ HasLog = 1,
+ HasLog1p = 0,
+ HasLog10 = 0,
+ HasPow = 1,
+ };
};
#endif
@@ -112,6 +129,20 @@ bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) {
}
+// Can't use std::pair on CUDA devices
+template <typename Idx> struct IndexPair {
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+
+ EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
+ first = val.first;
+ second = val.second;
+ }
+
+ Idx first;
+ Idx second;
+};
+
#ifdef EIGEN_HAS_SFINAE
namespace internal {
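
Reviewer note: IndexPair mirrors std::pair closely enough (first/second members, constexpr constructors) that host-side call sites keep their familiar shape; the point of the custom struct is that it can be built and copied inside device code. One typical consumer of index pairs is the contraction API; a small host-side example (standard Tensor usage as of this patch, dimensions chosen arbitrarily):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 2> a(2, 3), b(3, 2);
      a.setConstant(1.0f);
      b.setConstant(2.0f);

      // Contract dimension 1 of `a` with dimension 0 of `b`, i.e. an ordinary
      // matrix product expressed through IndexPair.
      Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
      Eigen::Tensor<float, 2> c = a.contract(b, dims);

      std::cout << c << "\n";  // every entry is 1*2 summed over 3 terms, i.e. 6
    }
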
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index bfa65a607..d34f1e328 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
protected:
TensorEvaluator<ArgType, Device> m_impl;
@@ -409,7 +409,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
Index inputIndices[] = {0, 0};
@@ -603,6 +603,286 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
};
+
+namespace internal {
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = array_size<StartIndices>::value;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
+{
+ typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type>
+{
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
+{
+ public:
+ typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
+ typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(
+ const XprType& expr, const StartIndices& startIndices,
+ const StopIndices& stopIndices, const Strides& strides)
+ : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices),
+ m_strides(strides) {}
+
+ EIGEN_DEVICE_FUNC
+ const StartIndices& startIndices() const { return m_startIndices; }
+ EIGEN_DEVICE_FUNC
+ const StopIndices& stopIndices() const { return m_stopIndices; }
+ EIGEN_DEVICE_FUNC
+ const Strides& strides() const { return m_strides; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
+ {
+ typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(
+ assign, DefaultDevice());
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
+ {
+ typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(
+ assign, DefaultDevice());
+ return *this;
+ }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const StartIndices m_startIndices;
+ const StopIndices m_stopIndices;
+ const Strides m_strides;
+};
+
+// Eval as rvalue
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+ static const int NumDims = internal::array_size<Strides>::value;
+
+ enum {
+ // Alignment can't be guaranteed at compile time since it depends on the
+ // slice offsets and sizes.
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
+ {
+ // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
+ DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
+ for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+ eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
+ if(m_strides[i]>0){
+ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+ stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+ }else{
+ /* implies m_strides[i]<0 by assert */
+ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+ stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+ }
+ m_startIndices[i] = startIndicesClamped[i];
+ }
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+ // check for degenerate intervals and compute output tensor shape
+ bool degenerate = false;
+ for(int i = 0; i < NumDims; i++){
+ Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
+ if(interval == 0 || ((interval<0) != (m_strides[i]<0))){
+ m_dimensions[i] = 0;
+ degenerate = true;
+ }else{
+ m_dimensions[i] = interval / m_strides[i]
+ + (interval % m_strides[i] != 0 ? 1 : 0);
+ eigen_assert(m_dimensions[i] >= 0);
+ }
+ }
+ Strides output_dims = m_dimensions;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = m_strides[0];
+ m_offsets[0] = startIndicesClamped[0];
+ Index previousDimProduct = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ previousDimProduct *= input_dims[i-1];
+ m_inputStrides[i] = previousDimProduct * m_strides[i];
+ m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+ }
+
+ // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+ // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+ }
+ } else {
+ m_inputStrides[NumDims-1] = m_strides[NumDims-1];
+ m_offsets[NumDims-1] = startIndicesClamped[NumDims-1];
+ Index previousDimProduct = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ previousDimProduct *= input_dims[i+1];
+ m_inputStrides[i] = previousDimProduct * m_strides[i];
+ m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+ }
+
+ m_outputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
+ // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+ }
+ }
+ m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
+ device.lastLevelCacheSize() /
+ sizeof(Scalar));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(srcCoeff(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+ return NULL;
+ }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i >= 0; --i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+ index -= idx * m_outputStrides[i];
+ }
+ } else {
+ for (int i = 0; i < NumDims; ++i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+ index -= idx * m_outputStrides[i];
+ }
+ }
+ return inputIndex;
+ }
+
+ static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+ return numext::maxi(min, numext::mini(max,value));
+ }
+
+ array<Index, NumDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device& m_device;
+ DSizes<Index, NumDims> m_startIndices; // clamped startIndices
+ DSizes<Index, NumDims> m_dimensions;
+ DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
+ const Strides m_strides;
+ std::size_t m_block_total_size_max;
+};
+
+// Eval as lvalue
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+ : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base;
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+ static const int NumDims = internal::array_size<Strides>::value;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+};
+
+
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
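
For context, the evaluator constructor above derives each output extent with the same rule as Python-style extended slicing: clamp the bounds, then count how many stride steps fit in the (possibly reversed) interval, with degenerate intervals collapsing to zero. A host-side restatement of just that rule (slice_extent is a hypothetical helper for illustration, not part of Eigen):

    #include <cassert>
    #include <cstdio>

    // A dimension is empty when the interval is empty or points the opposite way
    // from the stride; otherwise it is the number of stride steps, rounded up.
    long slice_extent(long start, long stop, long stride) {
      assert(stride != 0 && "0 stride is invalid");
      const long interval = stop - start;
      if (interval == 0 || ((interval < 0) != (stride < 0))) return 0;  // degenerate
      return interval / stride + (interval % stride != 0 ? 1 : 0);
    }

    int main() {
      std::printf("%ld\n", slice_extent(0, 10, 3));   // 4: indices 0, 3, 6, 9
      std::printf("%ld\n", slice_extent(9, -1, -2));  // 5: indices 9, 7, 5, 3, 1
      std::printf("%ld\n", slice_extent(5, 5, 1));    // 0: empty interval
      std::printf("%ld\n", slice_extent(5, 2, 1));    // 0: stride points the wrong way
    }
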
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 88b838b27..647bcf108 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = false,
+ IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
// The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
// to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
// of 1 element first and then pad.
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Compute dimensions
m_dimensions = m_impl.dimensions();
@@ -150,27 +150,26 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
- if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+ if (isPaddingAtIndexForDim(idx, i)) {
return m_paddingValue;
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
- if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) {
+ if (isPaddingAtIndexForDim(index, 0)) {
return m_paddingValue;
}
inputIndex += (index - m_padding[0].first);
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i+1];
- if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+ if (isPaddingAtIndexForDim(idx, i)) {
return m_paddingValue;
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i+1];
}
- if (index < m_padding[NumDims-1].first ||
- index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
+ if (isPaddingAtIndexForDim(index, NumDims-1)) {
return m_paddingValue;
}
inputIndex += (index - m_padding[NumDims-1].first);
@@ -187,43 +186,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
return packetRowMajor(index);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
- {
- Index inputIndex;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- {
- const Index idx = coords[0];
- if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) {
- return m_paddingValue;
- }
- inputIndex = idx - m_padding[0].first;
- }
- for (int i = 1; i < NumDims; ++i) {
- const Index idx = coords[i];
- if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
- return m_paddingValue;
- }
- inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
- }
- } else {
- {
- const Index idx = coords[NumDims-1];
- if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
- return m_paddingValue;
- }
- inputIndex = idx - m_padding[NumDims-1].first;
- }
- for (int i = NumDims - 2; i >= 0; --i) {
- const Index idx = coords[i];
- if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
- return m_paddingValue;
- }
- inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
- }
- }
- return m_impl.coeff(inputIndex);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
TensorOpCost cost = m_impl.costPerCoeff(vectorized);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -239,6 +201,40 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
+ Index index, int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+ index < m_padding[dim_index].first) ||
+ (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+ index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#else
+ return (index < m_padding[dim_index].first) ||
+ (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
+ int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+ EIGEN_UNUSED_VARIABLE(dim_index);
+ return false;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
+ int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+ EIGEN_UNUSED_VARIABLE(dim_index);
+ return false;
+#endif
+ }
+
+
void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
const double in = static_cast<double>(m_impl.dimensions()[i]);
const double out = in + m_padding[i].first + m_padding[i].second;
@@ -261,7 +257,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
@@ -273,15 +269,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
const Index lastPaddedRight = m_outputStrides[i+1];
- if (last < lastPaddedLeft) {
+ if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
// all the coefficient are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
// all the coefficient are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
// all the coefficient are between the 2 padding zones.
const Index idx = index / m_outputStrides[i];
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -299,15 +295,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
const Index lastPaddedRight = m_outputStrides[1];
- if (last < lastPaddedLeft) {
+ if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
// all the coefficient are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
// all the coefficient are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
// all the coefficient are between the 2 padding zones.
inputIndex += (index - m_padding[0].first);
return m_impl.template packet<Unaligned>(inputIndex);
@@ -318,7 +314,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
@@ -331,15 +327,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
const Index lastPaddedRight = m_outputStrides[i];
- if (last < lastPaddedLeft) {
+ if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
// all the coefficient are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
// all the coefficient are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
// all the coefficient are between the 2 padding zones.
const Index idx = index / m_outputStrides[i+1];
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -357,15 +353,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
const Index lastPaddedRight = m_outputStrides[NumDims-1];
- if (last < lastPaddedLeft) {
+ if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
// all the coefficient are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
// all the coefficient are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
// all the coefficient are between the 2 padding zones.
inputIndex += (index - m_padding[NumDims-1].first);
return m_impl.template packet<Unaligned>(inputIndex);
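
Reviewer note: the fast paths added above only fire when PaddingDimensions can answer the index_pair_*_statically_eq queries at compile time; with an ordinary runtime padding spec those queries fall back to "unknown" and the evaluator behaves exactly as before. For reference, the plain runtime use of the padding op (standard Tensor API, dimensions chosen arbitrarily):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>
    #include <utility>

    int main() {
      Eigen::Tensor<float, 2> a(2, 3);
      a.setConstant(1.0f);

      // first = amount of padding before, second = amount after, per dimension.
      Eigen::array<std::pair<int, int>, 2> paddings;
      paddings[0] = std::make_pair(0, 1);
      paddings[1] = std::make_pair(2, 3);

      Eigen::Tensor<float, 2> b = a.pad(paddings);  // zero-padded to 3 x 8
      std::cout << b.dimension(0) << " x " << b.dimension(1) << "\n";
    }
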
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index a87e45330..886a254f6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -184,7 +184,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
new file mode 100644
index 000000000..1655a813e
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -0,0 +1,276 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+
+EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
+#ifdef __CUDA_ARCH__
+ // We don't support 3d kernels since we currently only use 1 and
+ // 2d kernels.
+ assert(threadIdx.z == 0);
+ return clock64() +
+ blockIdx.x * blockDim.x + threadIdx.x +
+ gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+
+#elif defined _WIN32
+ // Use the current time as a baseline.
+ SYSTEMTIME st;
+ GetSystemTime(&st);
+ int time = st.wSecond + 1000 * st.wMilliseconds;
+ // Mix in a random number to make sure that we get different seeds if
+ // we try to generate seeds faster than the clock resolution.
+ // We need 2 random values since the generator only generate 16 bits at
+ // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
+ int rnd1 = ::rand();
+ int rnd2 = ::rand();
+ uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
+ return rnd;
+
+#elif defined __APPLE__
+ // Same approach as for win32, except that the random number generator
+ // is better (https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
+ uint64_t rnd = ::random() ^ mach_absolute_time();
+ return rnd;
+
+#else
+ // Augment the current time with pseudo random number generation
+ // to ensure that we get different seeds if we try to generate seeds
+ // faster than the clock resolution.
+ timespec ts;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ uint64_t rnd = ::random() ^ ts.tv_nsec;
+ return rnd;
+#endif
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+ // TODO: Unify with the implementation in the non blocking thread pool.
+ uint64_t current = *state;
+ // Update the internal state
+ *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+ // Generate the random output (using the PCG-XSH-RS scheme)
+ return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
+ seed = seed ? seed : get_random_seed();
+ return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+}
+
+} // namespace
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeUniform(uint64_t* state) {
+ unsigned rnd = PCG_XSH_RS_generator(state);
+ return static_cast<T>(rnd);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
+ Eigen::half result;
+ // Generate 10 random bits for the mantissa
+ unsigned rnd = PCG_XSH_RS_generator(state);
+ result.x = static_cast<uint16_t>(rnd & 0x3ffu);
+ // Set the exponent
+ result.x |= (static_cast<uint16_t>(15) << 10);
+ // Return the final result
+ return result - Eigen::half(1.0f);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float RandomToTypeUniform<float>(uint64_t* state) {
+ typedef union {
+ uint32_t raw;
+ float fp;
+ } internal;
+ internal result;
+ // Generate 23 random bits for the mantissa
+ const unsigned rnd = PCG_XSH_RS_generator(state);
+ result.raw = rnd & 0x7fffffu;
+ // Set the exponent
+ result.raw |= (static_cast<uint32_t>(127) << 23);
+ // Return the final result
+ return result.fp - 1.0f;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double RandomToTypeUniform<double>(uint64_t* state) {
+ typedef union {
+ uint64_t raw;
+ double dp;
+ } internal;
+ internal result;
+ result.raw = 0;
+ // Generate 52 random bits for the mantissa
+ // First generate the upper 20 bits
+ unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+ // Then generate the lower 32 bits
+ unsigned rnd2 = PCG_XSH_RS_generator(state);
+ result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+ // Set the exponent
+ result.raw |= (static_cast<uint64_t>(1023) << 52);
+ // Return the final result
+ return result.dp - 1.0;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
+ return std::complex<float>(RandomToTypeUniform<float>(state),
+ RandomToTypeUniform<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
+ return std::complex<double>(RandomToTypeUniform<double>(state),
+ RandomToTypeUniform<double>(state));
+}
+
+template <typename T> class UniformRandomGenerator {
+ public:
+ static const bool PacketAccess = true;
+
+ // Uses the given "seed" if non-zero, otherwise uses a random seed.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+ uint64_t seed = 0) {
+ m_state = PCG_XSH_RS_state(seed);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+ const UniformRandomGenerator& other) {
+ m_state = other.m_state;
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T operator()(Index i) const {
+ uint64_t local_state = m_state + i;
+ T result = RandomToTypeUniform<T>(&local_state);
+ m_state = local_state;
+ return result;
+ }
+
+ template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(Index i) const {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX T values[packetSize];
+ uint64_t local_state = m_state + i;
+ for (int j = 0; j < packetSize; ++j) {
+ values[j] = RandomToTypeUniform<T>(&local_state);
+ }
+ m_state = local_state;
+ return internal::pload<Packet>(values);
+ }
+
+ private:
+ mutable uint64_t m_state;
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+ enum {
+ // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+ Cost = 12 * NumTraits<Scalar>::AddCost *
+ ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+ PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+ };
+};
+
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeNormal(uint64_t* state) {
+ // Use the ratio-of-uniforms method to generate numbers following a normal
+ // distribution. See for example Numerical Recipes chapter 7.3.9 for the
+ // details.
+ T u, v, q;
+ do {
+ u = RandomToTypeUniform<T>(state);
+ v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+ const T x = u - T(0.449871);
+ const T y = numext::abs(v) + T(0.386595);
+ q = x*x + y * (T(0.196)*y - T(0.25472)*x);
+ } while (q > T(0.27597) &&
+ (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
+
+ return v/u;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
+ return std::complex<float>(RandomToTypeNormal<float>(state),
+ RandomToTypeNormal<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
+ return std::complex<double>(RandomToTypeNormal<double>(state),
+ RandomToTypeNormal<double>(state));
+}
+
+
+template <typename T> class NormalRandomGenerator {
+ public:
+ static const bool PacketAccess = true;
+
+ // Uses the given "seed" if non-zero, otherwise uses a random seed.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+ m_state = PCG_XSH_RS_state(seed);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
+ const NormalRandomGenerator& other) {
+ m_state = other.m_state;
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T operator()(Index i) const {
+ uint64_t local_state = m_state + i;
+ T result = RandomToTypeNormal<T>(&local_state);
+ m_state = local_state;
+ return result;
+ }
+
+ template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(Index i) const {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX T values[packetSize];
+ uint64_t local_state = m_state + i;
+ for (int j = 0; j < packetSize; ++j) {
+ values[j] = RandomToTypeNormal<T>(&local_state);
+ }
+ m_state = local_state;
+ return internal::pload<Packet>(values);
+ }
+
+ private:
+ mutable uint64_t m_state;
+};
+
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+ enum {
+ // On average, we need to generate about 3 random numbers
+ // 15 mul, 8 add, 1.5 logs
+ Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
+ 15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
+ 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+ PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+ };
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
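
For readers unfamiliar with the generator: the new file combines a PCG-XSH-RS step with the classic exponent trick (fill the mantissa with random bits, force the exponent so the value lies in [1, 2), then subtract 1). A standalone restatement of those two pieces, using memcpy instead of a union to sidestep type-punning questions (illustration only, not the Eigen symbols):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static unsigned pcg_xsh_rs(std::uint64_t* state) {
      const std::uint64_t current = *state;
      *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;  // LCG step
      return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
    }

    static float uniform_float(std::uint64_t* state) {
      std::uint32_t bits = pcg_xsh_rs(state) & 0x7fffffu;  // 23 random mantissa bits
      bits |= static_cast<std::uint32_t>(127) << 23;       // exponent for [1, 2)
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f - 1.0f;                                     // shift to [0, 1)
    }

    int main() {
      std::uint64_t state = 0x123456789abcdefULL;
      for (int i = 0; i < 4; ++i) std::printf("%f\n", uniform_float(&state));
    }
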
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 885295f0a..a87777b22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -87,7 +87,7 @@ struct preserve_inner_most_dims {
static const bool value = false;
};
-#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
@@ -122,7 +122,7 @@ struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
- EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
@@ -183,7 +183,7 @@ struct InnerMostDimPreserver {
template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
- EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
@@ -248,16 +248,12 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
*output = reducer.finalize(reducer.initialize());
return;
}
-#ifdef EIGEN_USE_COST_MODEL
const TensorOpCost cost =
self.m_impl.costPerCoeff(Vectorizable) +
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
PacketSize);
const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
num_coeffs, cost, device.numThreads());
-#else
- const int num_threads = device.numThreads();
-#endif
if (num_threads == 1) {
*output =
InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -268,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
- Barrier barrier(numblocks);
+ Barrier barrier(internal::convert_index<unsigned int>(numblocks));
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
@@ -320,7 +316,18 @@ struct OuterReducer {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename S, typename R, typename I>
+__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+
+#endif
template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
@@ -396,7 +403,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
{
- EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -464,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
- return total_size > 1024 * 1024;
-#else
- return true || total_size;
-#endif
- }
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
- if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+ if (RunningFullReduction &&
+ internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
- (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+ !RunningOnGPU)) {
bool need_assign = false;
if (!data) {
m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
@@ -493,7 +492,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
}
// Attempt to use an optimized reduction.
- else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) {
+ else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
bool reducing_inner_dims = true;
for (int i = 0; i < NumReducedDims; ++i) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -506,8 +505,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
(reducing_inner_dims || ReducingInnerMostDims)) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data) {
+ if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+ data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ m_result = data;
+ }
+ else {
+ return true;
+ }
+ }
Op reducer(m_reducer);
- return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+ if (m_result) {
+ m_device.deallocate(m_result);
+ m_result = NULL;
+ }
+ return true;
+ } else {
+ return (m_result != NULL);
+ }
}
bool preserving_inner_dims = true;
@@ -522,8 +538,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
preserving_inner_dims) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data) {
+ if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+ data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ m_result = data;
+ }
+ else {
+ return true;
+ }
+ }
Op reducer(m_reducer);
- return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+ if (m_result) {
+ m_device.deallocate(m_result);
+ m_result = NULL;
+ }
+ return true;
+ } else {
+ return (m_result != NULL);
+ }
}
}
return true;
@@ -533,13 +566,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
m_impl.cleanup();
if (m_result) {
m_device.deallocate(m_result);
+ m_result = NULL;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- if (RunningFullReduction && m_result) {
- return *m_result;
+ if ((RunningFullReduction || RunningOnGPU) && m_result) {
+ return *(m_result + index);
}
Op reducer(m_reducer);
if (ReducingInnerMostDims || RunningFullReduction) {
@@ -558,8 +592,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
+
+ if (RunningOnGPU && m_result) {
+ return internal::pload<PacketReturnType>(m_result + index);
+ }
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
if (ReducingInnerMostDims) {
@@ -617,11 +655,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
- template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+#ifdef EIGEN_HAS_CUDA_FP16
+ template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+#endif
template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
+ template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
// Returns the Index in the input tensor of the first value that needs to be
// used to compute the reduction at output index "index".
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index fd2587dd5..65638b6a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,8 +67,41 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
#endif
}
-template <typename T>
-__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+}
+#endif
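
The half2 overload above makes an arbitrary reduction atomic by retrying an atomicCAS on the packed 32-bit representation until the update sticks. The same compare-and-swap retry pattern works for any read-modify-write that CUDA has no native atomic for; a minimal stand-alone sketch (an illustration, not part of this patch) for a float maximum:

__device__ float atomicMaxFloat(float* address, float value) {
  int* address_as_int = reinterpret_cast<int*>(address);
  int old = *address_as_int;
  while (__int_as_float(old) < value) {
    const int assumed = old;
    // Only one thread wins the CAS; the losers reload the value and retry.
    old = atomicCAS(address_as_int, assumed, __float_as_int(value));
    if (old == assumed) break;
  }
  return __int_as_float(old);
}
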
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
atomicAdd(output, accum);
#else
@@ -86,17 +119,44 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
}
}
+
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
- typename Self::CoeffReturnType* output) {
+ typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if __CUDA_ARCH__ >= 300
+ // Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
- // Initialize the output value if it wasn't initialized by the ReductionInitKernel
- if (gridDim.x == 1 && first_index == 0) {
- *output = reducer.initialize();
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ *output = reducer.initialize();
+ }
+ }
+ else {
+ if (threadIdx.x == 0) {
+ unsigned int block = atomicCAS(semaphore, 0u, 1u);
+ if (block == 0) {
+ // We're the first block to run; initialize the output value
+ atomicExchCustom(output, reducer.initialize());
+ __threadfence();
+ atomicExch(semaphore, 2u);
+ }
+ else {
+ // Wait for the first block to initialize the output value.
+ // Use atomicCAS here to ensure that the reads aren't cached
+ unsigned int val;
+ do {
+ val = atomicCAS(semaphore, 2u, 2u);
+ }
+ while (val < 2u);
+ }
+ }
}
+ __syncthreads();
+
+ eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
typename Self::CoeffReturnType accum = reducer.initialize();
Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
for (Index i = 0; i < max_iter; i+=BlockSize) {
@@ -108,50 +168,203 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
- reducer.reduce(__shfl_down(accum, offset), &accum);
+ reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(output, accum, reducer);
}
+
+ if (gridDim.x > 1 && threadIdx.x == 0) {
+ // Let the last block reset the semaphore
+ atomicInc(semaphore, gridDim.x + 1);
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
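
When more than one block is launched, the semaphore walks through three states: 0 (untouched), 1 (the first block is initializing the output) and 2 (initialized). Afterwards thread 0 of every block calls atomicInc(semaphore, gridDim.x + 1) exactly once, so the final increment wraps the counter back to 0 for the next launch. A host-side model of that wrap-around (an illustration only, not part of the patch):

#include <cassert>

// CUDA's atomicInc computes (old >= limit) ? 0 : old + 1.
unsigned int atomic_inc_model(unsigned int old, unsigned int limit) {
  return (old >= limit) ? 0u : old + 1u;
}

int main() {
  const unsigned int num_blocks = 4;  // stands in for gridDim.x
  unsigned int semaphore = 2;         // value once the output has been initialized
  for (unsigned int b = 0; b < num_blocks; ++b) {
    semaphore = atomic_inc_model(semaphore, num_blocks + 1);
  }
  assert(semaphore == 0);             // cleared, ready for the next reduction
  return 0;
}
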
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+ eigen_assert(blockDim.x == 1);
+ eigen_assert(gridDim.x == 1);
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
}
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index num_packets = num_coeffs / 2;
+ for (Index i = thread_id; i < num_packets; i += num_threads) {
+ ((half2*)output)[i] = reducer.template initializePacket<half2>();
+ }
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats.
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+ if (thread_id == 0 && num_coeffs % 2 != 0) {
+ output[num_coeffs-1] = reducer.initialize();
+ }
+}
- template <typename OutputType>
- static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) {
- assert(false && "Should only be called on floats");
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+ half* output, half2* scratch) {
+ eigen_assert(NumPerThread % 2 == 0);
+
+ const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
+
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+ if (gridDim.x == 1 && first_index == 0) {
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
+ __syncthreads();
+ }
+
+ half2 accum = reducer.template initializePacket<half2>();
+ const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+ for (Index i = 0; i < max_iter; i += BlockSize) {
+ const Index index = first_index + 2*i;
+ eigen_assert(index + 1 < num_coeffs);
+ half2 val = input.m_impl.template packet<Unaligned>(index);
+ reducer.reducePacket(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
}
- static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(scratch, accum, reducer);
+ }
+
+ __syncthreads();
+
+ if (gridDim.x == 1 && first_index == 0) {
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+ }
+}
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+ eigen_assert(threadIdx.x == 1);
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+ static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+ assert(false && "Should only be called on doubles, floats and half floats");
+ }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType Scalar;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return;
+ unsigned int* semaphore = NULL;
+ if (num_blocks > 1) {
+ semaphore = device.semaphore();
}
+ LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+ }
+};
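
Each block of the launch above covers block_size * num_per_thread = 32768 coefficients, so num_blocks is the rounded-up quotient. A quick host-side check of that arithmetic (illustration only; divup is assumed to round the quotient up in the usual way):

#include <cassert>

int divup_int(int a, int b) { return (a + b - 1) / b; }

int main() {
  // 2^20 coefficients, 256 threads per block, 128 coefficients per thread.
  assert(divup_int(1 << 20, 256 * 128) == 32);
  return 0;
}
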
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+ static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+ typedef typename Self::Index Index;
+
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ half2* scratch = static_cast<half2*>(device.scratchpad());
if (num_blocks > 1) {
- // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
// won't be a race conditions between multiple thread blocks.
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
- 1, 32, 0, device, reducer.initialize(), 1, output);
+ LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+ 1, 1, 0, device, reducer, self, num_coeffs, scratch);
}
- LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+ LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+ if (num_blocks > 1) {
+ LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+ 1, 1, 0, device, reducer, output, scratch);
+ }
+ }
+};
+#endif
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+ // Unfortunately nvidia doesn't support exotic types such as complex well,
+ // so we reduce the scope of the optimized version of the code to the simple
+ // cases of doubles, floats and half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+ template <typename OutputType>
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
@@ -160,6 +373,8 @@ template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
+#if __CUDA_ARCH__ >= 300
+ typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
@@ -179,6 +394,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
+ __syncthreads();
}
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
@@ -188,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
- float reduced_val = reducer.initialize();
+ Type reduced_val = reducer.initialize();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
- for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) {
- const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
reducer.reduce(val, &reduced_val);
}
break;
@@ -217,33 +433,128 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
atomicReduce(&(output[row]), reduced_val, reducer);
}
}
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ half* output) {
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.y == 1);
+ eigen_assert(gridDim.z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+ eigen_assert(unroll_times % 2 == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+ const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ Index i = 2*thread_id;
+ for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
+ half* loc = output + i;
+ *((half2*)loc) = reducer.template initializePacket<half2>();
+ }
+ if (i < num_preserved_coeffs) {
+ output[i] = reducer.initialize();
+ }
__syncthreads();
}
+
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+ const Index row = 2 * (i / input_col_blocks);
+
+ if (row + 1 < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+ half2 reduced_val1 = reducer.template initializePacket<half2>();
+ half2 reduced_val2 = reducer.template initializePacket<half2>();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
+ if (last_col >= num_coeffs_to_reduce) {
+ Index col = col_begin + blockDim.x * j;
+ for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+ const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val1, &reduced_val1);
+ const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ if (col < num_coeffs_to_reduce) {
+ // Peel off the final odd coefficient.
+ const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const half2 val1 = __halves2half2(last1, reducer.initialize());
+ reducer.reducePacket(val1, &reduced_val1);
+ const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
+ const half2 val2 = __halves2half2(last2, reducer.initialize());
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + blockDim.x * (j + k) * 2;
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+ reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+ }
+
+ half val1 = __low2half(reduced_val1);
+ reducer.reduce(__high2half(reduced_val1), &val1);
+ half val2 = __low2half(reduced_val2);
+ reducer.reduce(__high2half(reduced_val2), &val2);
+ half2 val = __halves2half2(val1, val2);
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ half* loc = output + row;
+ atomicReduce((half2*)loc, val, reducer);
+ }
+ }
+ }
}
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats.
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+#endif
- template <typename Device, typename OutputType>
- static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce floats on a gpu device");
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+ static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
return true;
}
+};
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
- // It's faster to use the usual code.
- if (num_coeffs_to_reduce <= 32) {
- return true;
- }
-
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 128;
@@ -259,7 +570,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
@@ -271,6 +582,85 @@ struct InnerReducer<Self, Op, GpuDevice> {
}
};
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+ static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ return true;
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ if (num_preserved_vals % 2 != 0) {
+ // Not supported yet, revert to the slower code path
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = /*256*/128;
+ const int num_per_thread = /*128*/64;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be a race condition between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+ 1, 1, 0, device, reducer, self, num_preserved_vals, output);
+ }
+
+ LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+#endif
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+ // Unfortunately nvidia doesn't support exotic types such as complex well,
+ // so we reduce the scope of the optimized version of the code to the simple
+ // cases of doubles, floats and half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+ template <typename OutputType>
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return true;
+ }
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 128) {
+ return true;
+ }
+
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ }
+};
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
@@ -283,6 +673,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
+ __syncthreads();
}
// Do the reduction.
@@ -307,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
// so reduce the scope of the optimized version of the code to the simple case
// of floats.
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
-
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
template <typename Device, typename OutputType>
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce floats on a gpu device");
+ assert(false && "Should only be called to reduce doubles or floats on a gpu device");
return true;
}
@@ -323,7 +714,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
return true;
}
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index bc92d9e6d..99245f778 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -193,7 +193,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
return m_evaluator->coeff(index);
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 1a59cc8f7..14e392e36 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
: m_impl(op.expression(), device), m_reverse(op.reverse())
{
// Reversing a scalar isn't supported yet. It would be a no-op anyway.
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Compute strides
m_dimensions = m_impl.dimensions();
@@ -195,7 +195,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// TODO(ndjaitly): write a better packing routine that uses
@@ -269,7 +269,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x) {
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// This code is pilfered from TensorMorphing.h
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
new file mode 100644
index 000000000..8501466ce
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -0,0 +1,287 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Op, typename XprType>
+struct traits<TensorScanOp<Op, XprType> >
+ : public traits<XprType> {
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename Op, typename XprType>
+struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
+{
+ typedef const TensorScanOp<Op, XprType>& type;
+};
+
+template<typename Op, typename XprType>
+struct nested<TensorScanOp<Op, XprType>, 1,
+ typename eval<TensorScanOp<Op, XprType> >::type>
+{
+ typedef TensorScanOp<Op, XprType> type;
+};
+} // end namespace internal
+
+/** \class TensorScan
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor scan class.
+ */
+template <typename Op, typename XprType>
+class TensorScanOp
+ : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
+public:
+ typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
+ const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
+ : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Index axis() const { return m_axis; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const XprType& expression() const { return m_expr; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Op accumulator() const { return m_accumulator; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool exclusive() const { return m_exclusive; }
+
+protected:
+ typename XprType::Nested m_expr;
+ const Index m_axis;
+ const Op m_accumulator;
+ const bool m_exclusive;
+};
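
A TensorScanOp is normally constructed indirectly; assuming the cumsum()/cumprod() helpers on TensorBase that build a TensorScanOp (a usage sketch, not part of this file):

#include <unsupported/Eigen/CXX11/Tensor>

void scan_example() {
  Eigen::Tensor<float, 2> t(3, 2);
  t.setValues({{1.f, 4.f}, {2.f, 5.f}, {3.f, 6.f}});
  // Inclusive running sum along axis 0: {{1,4},{3,9},{6,15}}.
  Eigen::Tensor<float, 2> inclusive = t.cumsum(0);
  // Exclusive variant shifts the result by one step: {{0,0},{1,4},{3,9}}.
  Eigen::Tensor<float, 2> exclusive = t.cumsum(0, /*exclusive=*/true);
}
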
+
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher;
+
+// Eval as rvalue
+template <typename Op, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
+
+ typedef TensorScanOp<Op, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+ BlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = true
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+ const Device& device)
+ : m_impl(op.expression(), device),
+ m_device(device),
+ m_exclusive(op.exclusive()),
+ m_accumulator(op.accumulator()),
+ m_size(m_impl.dimensions()[op.axis()]),
+ m_stride(1),
+ m_output(NULL) {
+
+ // Accumulating a scalar isn't supported.
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
+
+ // Compute stride of scan axis
+ const Dimensions& dims = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < op.axis(); ++i) {
+ m_stride = m_stride * dims[i];
+ }
+ } else {
+ for (int i = NumDims - 1; i > op.axis(); --i) {
+ m_stride = m_stride * dims[i];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_impl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
+ return m_stride;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
+ return m_size;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
+ return m_accumulator;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
+ return m_exclusive;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
+ return m_impl;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
+ return m_device;
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ ScanLauncher<Self, Op, Device> launcher;
+ if (data) {
+ launcher(*this, data);
+ return false;
+ }
+
+ const Index total_size = internal::array_prod(dimensions());
+ m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+ launcher(*this, m_output);
+ return true;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
+ {
+ return m_output;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_output[index];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ if (m_output != NULL) {
+ m_device.deallocate(m_output);
+ m_output = NULL;
+ }
+ m_impl.cleanup();
+ }
+
+protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device& m_device;
+ const bool m_exclusive;
+ Op m_accumulator;
+ const Index m_size;
+ Index m_stride;
+ CoeffReturnType* m_output;
+};
+
+// CPU implementation of scan
+// TODO(ibab) This single-threaded implementation should be parallelized,
+// at least by running multiple scans at the same time.
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher {
+ void operator()(Self& self, typename Self::CoeffReturnType *data) {
+ Index total_size = internal::array_prod(self.dimensions());
+
+ // We fix the index along the scan axis to 0 and perform a
+ // scan per remaining entry. The iteration is split into two nested
+ // loops to avoid an integer division by keeping track of each idx1 and idx2.
+ for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+ for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+ // Calculate the starting offset for the scan
+ Index offset = idx1 + idx2;
+
+ // Compute the scan along the axis, starting at the calculated offset
+ typename Self::CoeffReturnType accum = self.accumulator().initialize();
+ for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+ Index curr = offset + idx3 * self.stride();
+
+ if (self.exclusive()) {
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ } else {
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
+ }
+ }
+ }
+ }
+ }
+};
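
The only difference between the exclusive and the inclusive scan is whether finalize() is written out before or after reduce() folds in the current coefficient. A one-dimensional model of the inner loop (stride == 1, a plain running sum; illustration only):

#include <cstddef>
#include <vector>

std::vector<float> prefix_sum(const std::vector<float>& in, bool exclusive) {
  std::vector<float> out(in.size());
  float accum = 0.f;                 // plays the role of reducer.initialize()
  for (std::size_t i = 0; i < in.size(); ++i) {
    if (exclusive) {
      out[i] = accum;                // write, then accumulate
      accum += in[i];
    } else {
      accum += in[i];                // accumulate, then write
      out[i] = accum;
    }
  }
  return out;                        // {1,2,3} -> {1,3,6} inclusive, {0,1,3} exclusive
}
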
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+ // Compute offset as in the CPU version
+ Index val = threadIdx.x + blockIdx.x * blockDim.x;
+ Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+ if (offset + (self.size() - 1) * self.stride() < total_size) {
+ // Compute the scan along the axis, starting at the calculated offset
+ typename Self::CoeffReturnType accum = self.accumulator().initialize();
+ for (Index idx = 0; idx < self.size(); idx++) {
+ Index curr = offset + idx * self.stride();
+ if (self.exclusive()) {
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ } else {
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
+ }
+ }
+ }
+ __syncthreads();
+
+}
+
+template <typename Self, typename Reducer>
+struct ScanLauncher<Self, Reducer, GpuDevice> {
+ void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+ Index total_size = internal::array_prod(self.dimensions());
+ Index num_blocks = (total_size / self.size() + 63) / 64;
+ Index block_size = 64;
+ LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+ }
+};
+#endif // EIGEN_USE_GPU && __CUDACC__
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index e76533710..113c060e3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -166,7 +166,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
@@ -248,7 +248,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
template <int StoreMode> EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 0e89033c4..f8121d17b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -85,7 +85,7 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
{ EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex>
EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 52b7d216a..6c35bfdb6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -164,7 +164,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index inputIndices[] = {0, 0};
@@ -289,7 +289,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
Index inputIndices[] = {0, 0};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 5950f38e2..3523e7c94 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -20,6 +20,7 @@ struct static_val {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
eigen_assert(v == n);
@@ -53,7 +54,7 @@ struct TensorUInt128
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
explicit TensorUInt128(const T& x) : high(0), low(x) {
- eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest())));
+ eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
eigen_assert(x >= 0);
}
@@ -74,21 +75,21 @@ struct TensorUInt128
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
return (lhs.high == rhs.high) & (lhs.low == rhs.low);
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
return (lhs.high != rhs.high) | (lhs.low != rhs.low);
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (lhs.high != rhs.high) {
return lhs.high > rhs.high;
@@ -98,7 +99,7 @@ static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<H
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (lhs.high != rhs.high) {
return lhs.high < rhs.high;
@@ -108,7 +109,7 @@ static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
if (result.low < rhs.low) {
@@ -119,7 +120,7 @@ static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
if (result.low > lhs.low) {
@@ -130,8 +131,8 @@ static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>
template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
// Split each 128-bit integer into 4 32-bit integers, and then do the
// multiplications by hand as follow:
@@ -205,8 +206,8 @@ static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>
}
template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index e735fc76f..0ca2cac84 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -187,7 +187,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
- EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
m_paddingValue = op.padding_value();
@@ -408,7 +408,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
deleted file mode 100644
index 6e871a8da..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_TensorSymmetry_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel
- )
-
-add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
deleted file mode 100644
index dc9fc78ec..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_TensorSymmetry_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
deleted file mode 100644
index 88fef50c6..000000000
--- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_ThreadPool_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 6dd64f185..71d55552d 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -50,7 +50,7 @@ class EventCount {
public:
class Waiter;
- EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) {
+ EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
// Initialize epoch to something close to overflow to test overflow.
state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
@@ -169,7 +169,8 @@ class EventCount {
class Waiter {
friend class EventCount;
- std::atomic<Waiter*> next;
+ // Align to a 128-byte boundary to prevent false sharing with other Waiter objects in the same vector.
+ EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
std::mutex mu;
std::condition_variable cv;
uint64_t epoch;
@@ -179,8 +180,6 @@ class EventCount {
kWaiting,
kSignaled,
};
- // Prevent false sharing with other Waiter objects in the same vector.
- char pad_[128];
};
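
Replacing the manual char pad_[128] with an alignment attribute on the first member over-aligns the whole Waiter, so adjacent elements of the waiters vector land on separate cache lines. A roughly equivalent standard-C++ sketch (names are illustrative, not Eigen code):

#include <atomic>
#include <cstdint>

struct alignas(128) WaiterLike {
  std::atomic<WaiterLike*> next;
  std::uint64_t epoch;
};
// sizeof is always a multiple of the alignment, so the padding comes for free.
static_assert(sizeof(WaiterLike) % 128 == 0,
              "the compiler pads the struct up to its alignment");
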
private:
@@ -200,7 +199,7 @@ class EventCount {
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
static const uint64_t kEpochInc = 1ull << kEpochShift;
std::atomic<uint64_t> state_;
- std::vector<Waiter>& waiters_;
+ MaxSizeVector<Waiter>& waiters_;
void Park(Waiter* w) {
std::unique_lock<std::mutex> lock(w->mu);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 1c471a19f..354bce52a 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -23,18 +23,44 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
: env_(env),
threads_(num_threads),
queues_(num_threads),
+ coprimes_(num_threads),
waiters_(num_threads),
- blocked_(),
- spinning_(),
- done_(),
+ blocked_(0),
+ spinning_(0),
+ done_(false),
ec_(waiters_) {
- for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue());
- for (int i = 0; i < num_threads; i++)
+ waiters_.resize(num_threads);
+
+ // Calculate coprimes of num_threads.
+ // Coprimes are used for a random walk over all threads in Steal
+ // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+ // a walk starting at thread index t and calculate num_threads - 1 subsequent
+ // indices as (t + coprime) % num_threads, we will cover all threads without
+ // repetitions (effectively getting a pseudo-random permutation of thread
+ // indices).
+ for (int i = 1; i <= num_threads; i++) {
+ unsigned a = i;
+ unsigned b = num_threads;
+ // If GCD(a, b) == 1, then a and b are coprimes.
+ while (b != 0) {
+ unsigned tmp = a;
+ a = b;
+ b = tmp % b;
+ }
+ if (a == 1) {
+ coprimes_.push_back(i);
+ }
+ }
+ for (int i = 0; i < num_threads; i++) {
+ queues_.push_back(new Queue());
+ }
+ for (int i = 0; i < num_threads; i++) {
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+ }
}
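
A stand-alone check of the coprime-walk property described in the constructor comment above: stepping through num_threads slots with a stride that is coprime to num_threads visits every slot exactly once (an illustration, not part of the patch):

#include <cassert>
#include <vector>

void check_coprime_walk(unsigned num_threads, unsigned coprime, unsigned start) {
  std::vector<bool> visited(num_threads, false);
  unsigned victim = start % num_threads;
  for (unsigned i = 0; i < num_threads; i++) {
    assert(!visited[victim]);        // never lands on the same queue twice
    visited[victim] = true;
    victim += coprime;
    if (victim >= num_threads) victim -= num_threads;
  }
}

int main() {
  check_coprime_walk(10, 3, 7);      // e.g. 10 threads, stride 3, arbitrary start
  return 0;
}
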
~NonBlockingThreadPoolTempl() {
- done_.store(true, std::memory_order_relaxed);
+ done_ = true;
// Now if all threads block without work, they will start exiting.
// But note that threads can continue to work arbitrary long,
// block, submit new work, unblock and otherwise live full life.
@@ -50,7 +76,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
PerThread* pt = GetPerThread();
if (pt->pool == this) {
// Worker thread of this pool, push onto the thread's queue.
- Queue* q = queues_[pt->index];
+ Queue* q = queues_[pt->thread_id];
t = q->PushFront(std::move(t));
} else {
// A free-standing thread (or worker of another pool), push onto a random
@@ -71,108 +97,111 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
env_.ExecuteTask(t); // Push failed, execute directly.
}
+ int NumThreads() const final {
+ return static_cast<int>(threads_.size());
+ }
+
+ int CurrentThreadId() const final {
+ const PerThread* pt =
+ const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
+ if (pt->pool == this) {
+ return pt->thread_id;
+ } else {
+ return -1;
+ }
+ }
+
private:
typedef typename Environment::EnvThread Thread;
struct PerThread {
- bool inited;
+ constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads.
- unsigned index; // Worker thread index in pool.
- unsigned rand; // Random generator state.
+ uint64_t rand; // Random generator state.
+ int thread_id; // Worker thread index in pool.
};
Environment env_;
MaxSizeVector<Thread*> threads_;
MaxSizeVector<Queue*> queues_;
- std::vector<EventCount::Waiter> waiters_;
+ MaxSizeVector<unsigned> coprimes_;
+ MaxSizeVector<EventCount::Waiter> waiters_;
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
EventCount ec_;
// Main worker thread loop.
- void WorkerLoop(unsigned index) {
+ void WorkerLoop(int thread_id) {
PerThread* pt = GetPerThread();
pt->pool = this;
- pt->index = index;
- Queue* q = queues_[index];
- EventCount::Waiter* waiter = &waiters_[index];
- std::vector<Task> stolen;
+ pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+ pt->thread_id = thread_id;
+ Queue* q = queues_[thread_id];
+ EventCount::Waiter* waiter = &waiters_[thread_id];
for (;;) {
- Task t;
- if (!stolen.empty()) {
- t = std::move(stolen.back());
- stolen.pop_back();
- }
- if (!t.f) t = q->PopFront();
+ Task t = q->PopFront();
if (!t.f) {
- if (Steal(&stolen)) {
- t = std::move(stolen.back());
- stolen.pop_back();
- while (stolen.size()) {
- Task t1 = q->PushFront(std::move(stolen.back()));
- stolen.pop_back();
- if (t1.f) {
- // There is not much we can do in this case. Just execute the
- // remaining directly.
- stolen.push_back(std::move(t1));
- break;
+ t = Steal();
+ if (!t.f) {
+ // Leave one thread spinning. This reduces latency.
+ // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
+ // Also, the time it takes to attempt to steal work 1000 times depends
+ // on the size of the thread pool. However, the speed at which the user
+ // of the thread pool submits tasks is independent of the size of the
+ // pool. Consider a time-based limit instead.
+ if (!spinning_ && !spinning_.exchange(true)) {
+ for (int i = 0; i < 1000 && !t.f; i++) {
+ t = Steal();
+ }
+ spinning_ = false;
+ }
+ if (!t.f) {
+ if (!WaitForWork(waiter, &t)) {
+ return;
}
}
}
}
if (t.f) {
env_.ExecuteTask(t);
- continue;
}
- // Leave one thread spinning. This reduces latency.
- if (!spinning_ && !spinning_.exchange(true)) {
- bool nowork = true;
- for (int i = 0; i < 1000; i++) {
- if (!OutOfWork()) {
- nowork = false;
- break;
- }
- }
- spinning_ = false;
- if (!nowork) continue;
- }
- if (!WaitForWork(waiter)) return;
}
}
// Steal tries to steal work from other worker threads in a best-effort manner.
- bool Steal(std::vector<Task>* stolen) {
- if (queues_.size() == 1) return false;
+ Task Steal() {
PerThread* pt = GetPerThread();
- unsigned lastq = pt->index;
- for (unsigned i = queues_.size(); i > 0; i--) {
- unsigned victim = Rand(&pt->rand) % queues_.size();
- if (victim == lastq && queues_.size() > 2) {
- i++;
- continue;
+ const size_t size = queues_.size();
+ unsigned r = Rand(&pt->rand);
+ unsigned inc = coprimes_[r % coprimes_.size()];
+ unsigned victim = r % size;
+ for (unsigned i = 0; i < size; i++) {
+ Task t = queues_[victim]->PopBack();
+ if (t.f) {
+ return t;
+ }
+ victim += inc;
+ if (victim >= size) {
+ victim -= size;
}
- // Steal half of elements from a victim queue.
- // It is typical to steal just one element, but that assumes that work is
- // recursively subdivided in halves so that the stolen element is exactly
- // half of work. If work elements are equally-sized, then is makes sense
- // to steal half of elements at once and then work locally for a while.
- if (queues_[victim]->PopBackHalf(stolen)) return true;
- lastq = victim;
}
- // Just to make sure that we did not miss anything.
- for (unsigned i = queues_.size(); i > 0; i--)
- if (queues_[i - 1]->PopBackHalf(stolen)) return true;
- return false;
+ return Task();
}
- // WaitForWork blocks until new work is available, or if it is time to exit.
- bool WaitForWork(EventCount::Waiter* waiter) {
- // We already did best-effort emptiness check in Steal, so prepare blocking.
+ // WaitForWork blocks until new work is available (returns true) or it is
+ // time to exit (returns false). It can optionally return a task to execute in t
+ // (in which case t.f != nullptr on return).
+ bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+ eigen_assert(!t->f);
+ // We already did best-effort emptiness check in Steal, so prepare for
+ // blocking.
ec_.Prewait(waiter);
- // Now do reliable emptiness check.
- if (!OutOfWork()) {
+ // Now do a reliable emptiness check.
+ int victim = NonEmptyQueueIndex();
+ if (victim != -1) {
ec_.CancelWait(waiter);
+ *t = queues_[victim]->PopBack();
return true;
}
// Number of blocked threads is used as termination condition.
@@ -186,7 +215,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
// right after incrementing blocked_ above. Now a free-standing thread
// submits work and calls destructor (which sets done_). If we don't
// re-check queues, we will exit leaving the work unexecuted.
- if (!OutOfWork()) {
+ if (NonEmptyQueueIndex() != -1) {
// Note: we must not pop from queues before we decrement blocked_,
// otherwise the following scenario is possible. Consider that instead
// of checking for emptiness we popped the only element from queues.
@@ -205,23 +234,36 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
return true;
}
- bool OutOfWork() {
- for (unsigned i = 0; i < queues_.size(); i++)
- if (!queues_[i]->Empty()) return false;
- return true;
+ int NonEmptyQueueIndex() {
+ PerThread* pt = GetPerThread();
+ const size_t size = queues_.size();
+ unsigned r = Rand(&pt->rand);
+ unsigned inc = coprimes_[r % coprimes_.size()];
+ unsigned victim = r % size;
+ for (unsigned i = 0; i < size; i++) {
+ if (!queues_[victim]->Empty()) {
+ return victim;
+ }
+ victim += inc;
+ if (victim >= size) {
+ victim -= size;
+ }
+ }
+ return -1;
}
- PerThread* GetPerThread() {
+ static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
EIGEN_THREAD_LOCAL PerThread per_thread_;
PerThread* pt = &per_thread_;
- if (pt->inited) return pt;
- pt->inited = true;
- pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
return pt;
}
- static unsigned Rand(unsigned* state) {
- return *state = *state * 1103515245 + 12345;
+ static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
+ uint64_t current = *state;
+ // Update the internal state
+ *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+ // Generate the random output (using the PCG-XSH-RS scheme)
+ return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}
};
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 0544a6e15..05ed76cbe 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -38,7 +38,7 @@ namespace Eigen {
template <typename Work, unsigned kSize>
class RunQueue {
public:
- RunQueue() : front_(), back_() {
+ RunQueue() : front_(0), back_(0) {
// require power-of-two for fast masking
eigen_assert((kSize & (kSize - 1)) == 0);
eigen_assert(kSize > 2); // why would you do this?
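
As a brief aside on the power-of-two assertion above, a hedged sketch (the values are arbitrary and the masking expression only illustrates the general technique, it is not lifted from RunQueue): for a power-of-two kSize, a single AND with kSize - 1 gives the same result as the generic modulo.

    #include <cassert>

    int main() {
      const unsigned kSize = 1024;   // must be a power of two
      const unsigned kMask = kSize - 1;
      unsigned pos = 123456u;        // some free-running counter value
      // For powers of two, a single AND replaces the generic modulo.
      assert((pos & kMask) == (pos % kSize));
      return 0;
    }
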
@@ -100,7 +100,7 @@ class RunQueue {
// PopBack removes and returns the last element in the queue.
// Can fail spuriously.
Work PopBack() {
- if (Empty()) return 0;
+ if (Empty()) return Work();
std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
if (!lock) return Work();
unsigned back = back_.load(std::memory_order_relaxed);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
index 17fd1658b..e75d0f467 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -24,7 +24,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
: env_(env), threads_(num_threads), waiters_(num_threads) {
for (int i = 0; i < num_threads; i++) {
- threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
+ threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));
}
}
@@ -55,7 +55,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
// Schedule fn() for execution in the pool of threads. The functions are
// executed in the order in which they are scheduled.
- void Schedule(std::function<void()> fn) {
+ void Schedule(std::function<void()> fn) final {
Task t = env_.CreateTask(std::move(fn));
std::unique_lock<std::mutex> l(mu_);
if (waiters_.empty()) {
@@ -69,9 +69,25 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
}
}
+ int NumThreads() const final {
+ return static_cast<int>(threads_.size());
+ }
+
+ int CurrentThreadId() const final {
+ const PerThread* pt = this->GetPerThread();
+ if (pt->pool == this) {
+ return pt->thread_id;
+ } else {
+ return -1;
+ }
+ }
+
protected:
- void WorkerLoop() {
+ void WorkerLoop(int thread_id) {
std::unique_lock<std::mutex> l(mu_);
+ PerThread* pt = GetPerThread();
+ pt->pool = this;
+ pt->thread_id = thread_id;
Waiter w;
Task t;
while (!exiting_) {
@@ -111,13 +127,24 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
bool ready;
};
+ struct PerThread {
+ constexpr PerThread() : pool(NULL), thread_id(-1) { }
+ SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads.
+ int thread_id; // Worker thread index in pool.
+ };
+
Environment env_;
std::mutex mu_;
MaxSizeVector<Thread*> threads_; // All threads
MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
- std::deque<Task> pending_; // Queue of pending work
- std::condition_variable empty_; // Signaled on pending_.empty()
+ std::deque<Task> pending_; // Queue of pending work
+ std::condition_variable empty_; // Signaled on pending_.empty()
bool exiting_ = false;
+
+ PerThread* GetPerThread() const {
+ EIGEN_THREAD_LOCAL PerThread per_thread;
+ return &per_thread;
+ }
};
typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index d2204ad5b..399f95cc1 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -21,14 +21,14 @@ struct StlThreadEnvironment {
// destructor must join the thread.
class EnvThread {
public:
- EnvThread(std::function<void()> f) : thr_(f) {}
+ EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
~EnvThread() { thr_.join(); }
private:
std::thread thr_;
};
- EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
+ EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
void ExecuteTask(const Task& t) { t.f(); }
};
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 38b40aceb..a65ee97c9 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -18,6 +18,13 @@ class ThreadPoolInterface {
public:
virtual void Schedule(std::function<void()> fn) = 0;
+ // Returns the number of threads in the pool.
+ virtual int NumThreads() const = 0;
+
+ // Returns a logical thread index between 0 and NumThreads() - 1 if called
+ // from one of the threads in the pool. Returns -1 otherwise.
+ virtual int CurrentThreadId() const = 0;
+
virtual ~ThreadPoolInterface() {}
};
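
A hypothetical usage sketch of the two new interface methods (it assumes the ThreadPool module header and the SimpleThreadPool typedef shown earlier; the pool size and printed strings are invented for illustration):

    #include <unsupported/Eigen/CXX11/ThreadPool>
    #include <cstdio>

    int main() {
      Eigen::SimpleThreadPool pool(4);
      std::printf("NumThreads: %d\n", pool.NumThreads());       // 4
      std::printf("Outside id: %d\n", pool.CurrentThreadId());  // -1: not a pool thread
      pool.Schedule([&pool]() {
        // Inside a worker: a logical index in [0, NumThreads() - 1].
        std::printf("Worker id: %d\n", pool.CurrentThreadId());
      });
      return 0;  // the pool destructor drains pending work before exiting
    }
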
diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
deleted file mode 100644
index 7eab492d6..000000000
--- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 24159e54c..30d3ebcff 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -117,7 +117,7 @@ template <typename T, size_t n> class array {
values[7] = v8;
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
eigen_assert(l.size() == n);
@@ -167,7 +167,7 @@ template <typename T> class array<T, 0> {
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array() : dummy() { }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
eigen_assert(l.size() == 0);
}
diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 961456f10..4bc3dd1ba 100644
--- a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -55,6 +55,17 @@ class MaxSizeVector {
internal::aligned_free(data_);
}
+ void resize(size_t n) {
+ eigen_assert(n <= reserve_);
+ for (size_t i = size_; i < n; ++i) {
+ new (&data_[i]) T;
+ }
+ for (size_t i = n; i < size_; ++i) {
+ data_[i].~T();
+ }
+ size_ = n;
+ }
+
// Append new elements (up to reserved size).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void push_back(const T& t) {
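
A hypothetical sketch of the new resize() semantics (it assumes MaxSizeVector is pulled in via the CXX11 Tensor module with EIGEN_USE_THREADS; the capacity and values are invented): growing default-constructs the new slots, shrinking destroys the trailing ones, and the new size must not exceed the reserved capacity.

    #define EIGEN_USE_THREADS  // pulls in the thread-pool utilities, including MaxSizeVector
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::MaxSizeVector<int> v(8);  // reserves room for 8 elements, size() == 0
      v.push_back(1);
      v.push_back(2);
      v.resize(5);                     // slots 2..4 are default-constructed
      v.resize(1);                     // slots 1..4 are destroyed, size() == 1
      return 0;                        // v.resize(9) would trip the eigen_assert
    }
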
diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles
new file mode 100644
index 000000000..521fa3f76
--- /dev/null
+++ b/unsupported/Eigen/EulerAngles
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#define EIGEN_EULERANGLES_MODULE_H
+
+
+#include "Eigen/Core"
+#include "Eigen/Geometry"
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup EulerAngles_Module EulerAngles module
+ * \brief This module provides a generic Euler angles rotation class.
+ *
+ * Euler angles are a way to represent a 3D rotation.
+ *
+ * In order to use this module in your code, include this header:
+ * \code
+ * #include <unsupported/Eigen/EulerAngles>
+ * \endcode
+ *
+ * See \ref EulerAngles for more information.
+ *
+ */
+
+}
+
+#include "src/EulerAngles/EulerSystem.h"
+#include "src/EulerAngles/EulerAngles.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_EULERANGLES_MODULE_H
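
A hypothetical quick-start for the new module (the angle values are arbitrary): construct one of the typedef'd Euler angle types defined further below and convert it to a rotation matrix.

    #include <unsupported/Eigen/EulerAngles>
    #include <iostream>

    int main() {
      // alpha, beta, gamma in radians, applied about the Z, Y and X axes (intrinsic).
      Eigen::EulerAnglesZYXd e(0.5, 0.1, -0.3);
      Eigen::Matrix3d R = e.toRotationMatrix();
      std::cout << R << "\n";
      return 0;
    }
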
diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct
index c932c06a6..5f5afb8cf 100644
--- a/unsupported/Eigen/KroneckerProduct
+++ b/unsupported/Eigen/KroneckerProduct
@@ -13,6 +13,8 @@
#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "../../Eigen/src/SparseCore/SparseUtil.h"
+
namespace Eigen {
/**
diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport
index 89036886b..7f0b70c63 100644
--- a/unsupported/Eigen/MPRealSupport
+++ b/unsupported/Eigen/MPRealSupport
@@ -67,27 +67,32 @@ int main()
IsSigned = 1,
IsComplex = 0,
RequireInitialization = 1,
- ReadCost = 10,
- AddCost = 10,
- MulCost = 40
+ ReadCost = HugeCost,
+ AddCost = HugeCost,
+ MulCost = HugeCost
};
typedef mpfr::mpreal Real;
typedef mpfr::mpreal NonInteger;
- inline static Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); }
- inline static Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
+ static inline Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); }
+ static inline Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
// Constants
- inline static Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); }
- inline static Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); }
- inline static Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); }
- inline static Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); }
+ static inline Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); }
+ static inline Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); }
+ static inline Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); }
+ static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); }
- inline static Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); }
- inline static Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); }
+ static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); }
+ static inline Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); }
- inline static Real dummy_precision()
+#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
+ static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec()) { return std::numeric_limits<Real>::digits10(Precision); }
+ static inline int digits10 (const Real& x) { return std::numeric_limits<Real>::digits10(x); }
+#endif
+
+ static inline Real dummy_precision()
{
mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
return mpfr::machine_epsilon(weak_prec);
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
new file mode 100644
index 000000000..7c7493c56
--- /dev/null
+++ b/unsupported/Eigen/SpecialFunctions
@@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_MODULE
+#define EIGEN_SPECIALFUNCTIONS_MODULE
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup SpecialFunctions_Module Special math functions module
+ *
+ * This module features additional coefficient-wise math functions available
+ * within the numext:: namespace for the scalar versions, and as methods and/or free
+ * functions of Array. These include:
+ *
+ * - erf
+ * - erfc
+ * - lgamma
+ * - igamma
+ * - igammac
+ * - digamma
+ * - polygamma
+ * - zeta
+ * - betainc
+ *
+ * \code
+ * #include <unsupported/Eigen/SpecialFunctions>
+ * \endcode
+ */
+//@{
+
+}
+
+#include "src/SpecialFunctions/SpecialFunctionsImpl.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
+#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
+
+#if defined EIGEN_VECTORIZE_CUDA
+ #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
+#endif
+
+namespace Eigen {
+//@}
+}
+
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPECIALFUNCTIONS_MODULE
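
A hypothetical sketch of the scalar entry points listed above (it assumes a compiler with C99 math support; the arguments are arbitrary):

    #include <unsupported/Eigen/SpecialFunctions>
    #include <iostream>

    int main() {
      std::cout << Eigen::numext::erf(1.0) << "\n";     // ~0.842701
      std::cout << Eigen::numext::lgamma(4.5) << "\n";  // log(Gamma(4.5)) ~ 2.45374
      return 0;
    }
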
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
index 1a61e3367..33b6c393f 100644
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
@@ -20,37 +20,60 @@ public:
AutoDiffJacobian(const Functor& f) : Functor(f) {}
// forward constructors
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... T>
+ AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
+#else
template<typename T0>
AutoDiffJacobian(const T0& a0) : Functor(a0) {}
template<typename T0, typename T1>
AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
template<typename T0, typename T1, typename T2>
AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
+#endif
+
+ typedef typename Functor::InputType InputType;
+ typedef typename Functor::ValueType ValueType;
+ typedef typename ValueType::Scalar Scalar;
enum {
- InputsAtCompileTime = Functor::InputsAtCompileTime,
- ValuesAtCompileTime = Functor::ValuesAtCompileTime
+ InputsAtCompileTime = InputType::RowsAtCompileTime,
+ ValuesAtCompileTime = ValueType::RowsAtCompileTime
};
- typedef typename Functor::InputType InputType;
- typedef typename Functor::ValueType ValueType;
- typedef typename Functor::JacobianType JacobianType;
- typedef typename JacobianType::Scalar Scalar;
+ typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
typedef typename JacobianType::Index Index;
- typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
+ typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
typedef AutoDiffScalar<DerivativeType> ActiveScalar;
-
typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ // Some compilers don't accept variadic parameters after a default parameter,
+ // i.e., we can't just write _jac=0 but we need to overload operator():
+ EIGEN_STRONG_INLINE
+ void operator() (const InputType& x, ValueType* v) const
+ {
+ this->operator()(x, v, 0);
+ }
+ template<typename... ParamsType>
+ void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
+ const ParamsType&... Params) const
+#else
void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
+#endif
{
eigen_assert(v!=0);
+
if (!_jac)
{
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ Functor::operator()(x, v, Params...);
+#else
Functor::operator()(x, v);
+#endif
return;
}
@@ -61,12 +84,16 @@ public:
if(InputsAtCompileTime==Dynamic)
for (Index j=0; j<jac.rows(); j++)
- av[j].derivatives().resize(this->inputs());
+ av[j].derivatives().resize(x.rows());
for (Index i=0; i<jac.cols(); i++)
- ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i);
+ ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ Functor::operator()(ax, &av, Params...);
+#else
Functor::operator()(ax, &av);
+#endif
for (Index i=0; i<jac.rows(); i++)
{
@@ -74,8 +101,6 @@ public:
jac.row(i) = av[i].derivatives();
}
}
-protected:
-
};
}
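
A hypothetical usage sketch of the refactored AutoDiffJacobian (the functor, its typedefs and the input values are invented for illustration; with the new code the JacobianType is derived from the functor's InputType/ValueType typedefs):

    #include <unsupported/Eigen/AutoDiff>
    #include <iostream>

    // Invented functor for the sketch: f(x) = (x0*x1, x0 + x1^2).
    struct MyFunctor {
      typedef Eigen::Vector2d InputType;
      typedef Eigen::Vector2d ValueType;
      // Must be callable with plain vectors as well as vectors of AutoDiffScalar.
      template <typename X, typename V>
      void operator()(const X& x, V* v) const {
        (*v)(0) = x(0) * x(1);
        (*v)(1) = x(0) + x(1) * x(1);
      }
    };

    int main() {
      Eigen::AutoDiffJacobian<MyFunctor> adjac((MyFunctor()));
      Eigen::Vector2d x(1.0, 2.0), v;
      Eigen::AutoDiffJacobian<MyFunctor>::JacobianType jac;  // 2x2, deduced from the typedefs
      adjac(x, &v, &jac);
      std::cout << v.transpose() << "\n";  // 2 5
      std::cout << jac << "\n";            // [2 1; 1 4]
      return 0;
    }
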
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 481dfa91a..50fedf6ac 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -30,6 +30,13 @@ template<typename _DerType, bool Enable> struct auto_diff_special_op;
} // end namespace internal
+template<typename _DerType> class AutoDiffScalar;
+
+template<typename NewDerType>
+inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) {
+ return AutoDiffScalar<NewDerType>(value,der);
+}
+
/** \class AutoDiffScalar
* \brief A scalar type replacement with automatic differentation capability
*
@@ -60,7 +67,7 @@ template<typename _DerType>
class AutoDiffScalar
: public internal::auto_diff_special_op
<_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
- typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
+ typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
{
public:
typedef internal::auto_diff_special_op
@@ -101,7 +108,7 @@ class AutoDiffScalar
template<typename OtherDerType>
AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other
#ifndef EIGEN_PARSED_BY_DOXYGEN
- , typename internal::enable_if<internal::is_same<Scalar,typename OtherDerType::Scalar>::value,void*>::type = 0
+ , typename internal::enable_if<internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value,void*>::type = 0
#endif
)
: m_value(other.value()), m_derivatives(other.derivatives())
@@ -257,20 +264,16 @@ class AutoDiffScalar
-m_derivatives);
}
- inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
operator*(const Scalar& other) const
{
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
- m_value * other,
- (m_derivatives * other));
+ return MakeAutoDiffScalar(m_value * other, m_derivatives * other);
}
- friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+ friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
operator*(const Scalar& other, const AutoDiffScalar& a)
{
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
- a.value() * other,
- a.derivatives() * other);
+ return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other);
}
// inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
@@ -289,20 +292,16 @@ class AutoDiffScalar
// a.derivatives() * other);
// }
- inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
operator/(const Scalar& other) const
{
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
- m_value / other,
- (m_derivatives * (Scalar(1)/other)));
+ return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other)));
}
- friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
+ friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
operator/(const Scalar& other, const AutoDiffScalar& a)
{
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
- other / a.value(),
- a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
+ return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
}
// inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
@@ -322,34 +321,29 @@ class AutoDiffScalar
// }
template<typename OtherDerType>
- inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
- const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(
+ CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) >
operator/(const AutoDiffScalar<OtherDerType>& other) const
{
internal::make_coherent(m_derivatives, other.derivatives());
- return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
- const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >(
+ return MakeAutoDiffScalar(
m_value / other.value(),
- ((m_derivatives * other.value()) - (m_value * other.derivatives()))
+ ((m_derivatives * other.value()) - (other.derivatives() * m_value))
* (Scalar(1)/(other.value()*other.value())));
}
template<typename OtherDerType>
inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > >
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product),
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > >
operator*(const AutoDiffScalar<OtherDerType>& other) const
{
internal::make_coherent(m_derivatives, other.derivatives());
- return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
- const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >(
+ return MakeAutoDiffScalar(
m_value * other.value(),
- (m_derivatives * other.value()) + (m_value * other.derivatives()));
+ (m_derivatives * other.value()) + (other.derivatives() * m_value));
}
inline AutoDiffScalar& operator*=(const Scalar& other)
@@ -426,18 +420,18 @@ struct auto_diff_special_op<_DerType, true>
}
- inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+ inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >
operator*(const Real& other) const
{
- return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+ return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >(
derived().value() * other,
derived().derivatives() * other);
}
- friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
+ friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >
operator*(const Real& other, const AutoDiffScalar<_DerType>& a)
{
- return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
+ return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >(
a.value() * other,
a.derivatives() * other);
}
@@ -501,43 +495,44 @@ struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows,
}
};
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar>
-{
- enum { Defined = 1 };
- typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
-
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> >
-{
- enum { Defined = 1 };
- typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
+} // end namespace internal
-template<typename DerType>
-struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar>
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp>
{
- enum { Defined = 1 };
typedef AutoDiffScalar<DerType> ReturnType;
};
-template<typename DerType>
-struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> >
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp>
{
- enum { Defined = 1 };
typedef AutoDiffScalar<DerType> ReturnType;
};
-} // end namespace internal
+
+// The following is an attempt to let Eigen know about expression templates, but that's more tricky!
+
+// template<typename DerType, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp>
+// {
+// enum { Defined = 1 };
+// typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType;
+// };
+//
+// template<typename DerType1,typename DerType2, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp>
+// {
+// enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value };
+// typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType;
+// };
#define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
template<typename DerType> \
- inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \
+ inline const Eigen::AutoDiffScalar< \
+ EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \
FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
using namespace Eigen; \
- typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
- typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \
+ EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
CODE; \
}
@@ -548,56 +543,75 @@ inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x) {
template<typename DerType>
inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&) { return 0.; }
template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (min)(const AutoDiffScalar<DerType>& x, const T& y) { return (x <= y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const T& y) {
+ typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+ return (x <= y ? ADS(x) : ADS(y));
+}
template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (max)(const AutoDiffScalar<DerType>& x, const T& y) { return (x >= y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const T& y) {
+ typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+ return (x >= y ? ADS(x) : ADS(y));
+}
template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (min)(const T& x, const AutoDiffScalar<DerType>& y) { return (x < y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const T& x, const AutoDiffScalar<DerType>& y) {
+ typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+ return (x < y ? ADS(x) : ADS(y));
+}
template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (max)(const T& x, const AutoDiffScalar<DerType>& y) { return (x > y ? x : y); }
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const T& x, const AutoDiffScalar<DerType>& y) {
+ typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+ return (x > y ? ADS(x) : ADS(y));
+}
+template<typename DerType>
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+ return (x.value() < y.value() ? x : y);
+}
+template<typename DerType>
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+ return (x.value() >= y.value() ? x : y);
+}
+
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs,
using std::abs;
- return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
+ return Eigen::MakeAutoDiffScalar(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2,
using numext::abs2;
- return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
+ return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt,
using std::sqrt;
Scalar sqrtx = sqrt(x.value());
- return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
+ return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos,
using std::cos;
using std::sin;
- return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));)
+ return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin,
using std::sin;
using std::cos;
- return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));)
+ return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp,
using std::exp;
Scalar expx = exp(x.value());
- return ReturnType(expx,x.derivatives() * expx);)
+ return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log,
using std::log;
- return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
+ return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
template<typename DerType>
-inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar>, const typename internal::remove_all<DerType>::type> >
-pow(const Eigen::AutoDiffScalar<DerType>& x, typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar y)
+inline const Eigen::AutoDiffScalar<
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) >
+pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y)
{
using namespace Eigen;
- typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
- typedef typename Eigen::internal::traits<DerTypeCleaned>::Scalar Scalar;
- return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerTypeCleaned> >(
- std::pow(x.value(),y),
- x.derivatives() * (y * std::pow(x.value(),y-1)));
+ using std::pow;
+ return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1)));
}
@@ -622,27 +636,44 @@ atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan,
using std::tan;
using std::cos;
- return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
+ return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin,
using std::sqrt;
using std::asin;
- return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
+ return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos,
using std::sqrt;
using std::acos;
- return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+ return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh,
+ using std::cosh;
+ using std::tanh;
+ return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh,
+ using std::sinh;
+ using std::cosh;
+ return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh,
+ using std::sinh;
+ using std::cosh;
+ return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));)
#undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY
template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
- : NumTraits< typename NumTraits<typename DerType::Scalar>::Real >
+ : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real >
{
- typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime,
- DerType::Options, DerType::MaxRowsAtCompileTime, DerType::MaxColsAtCompileTime> > Real;
+ typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
+ typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime,
+ 0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real;
typedef AutoDiffScalar<DerType> NonInteger;
typedef AutoDiffScalar<DerType> Nested;
+ typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal;
enum{
RequireInitialization = 1
};
diff --git a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt b/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
deleted file mode 100644
index ad91fd9c4..000000000
--- a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_AutoDiff_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_AutoDiff_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/AutoDiff COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/BVH/CMakeLists.txt b/unsupported/Eigen/src/BVH/CMakeLists.txt
deleted file mode 100644
index b377d865c..000000000
--- a/unsupported/Eigen/src/BVH/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_BVH_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_BVH_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/BVH COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt
deleted file mode 100644
index a7e8c7553..000000000
--- a/unsupported/Eigen/src/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-ADD_SUBDIRECTORY(AutoDiff)
-ADD_SUBDIRECTORY(BVH)
-ADD_SUBDIRECTORY(Eigenvalues)
-ADD_SUBDIRECTORY(FFT)
-ADD_SUBDIRECTORY(IterativeSolvers)
-ADD_SUBDIRECTORY(LevenbergMarquardt)
-ADD_SUBDIRECTORY(MatrixFunctions)
-ADD_SUBDIRECTORY(MoreVectorization)
-ADD_SUBDIRECTORY(NonLinearOptimization)
-ADD_SUBDIRECTORY(NumericalDiff)
-ADD_SUBDIRECTORY(Polynomials)
-ADD_SUBDIRECTORY(Skyline)
-ADD_SUBDIRECTORY(SparseExtra)
-ADD_SUBDIRECTORY(KroneckerProduct)
-ADD_SUBDIRECTORY(Splines)
diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 3b6a69aff..866a8a460 100644
--- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -628,15 +628,15 @@ ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
m_info = Success;
}
- delete select;
+ delete[] select;
}
- delete v;
- delete iparam;
- delete ipntr;
- delete workd;
- delete workl;
- delete resid;
+ delete[] v;
+ delete[] iparam;
+ delete[] ipntr;
+ delete[] workd;
+ delete[] workl;
+ delete[] resid;
m_isInitialized = true;
diff --git a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt
deleted file mode 100644
index 1d4387c82..000000000
--- a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Eigenvalues_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Eigenvalues_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Eigenvalues COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
new file mode 100644
index 000000000..40af550e8
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_EulerAngles_SRCS "*.h")
+
+INSTALL(FILES
+ ${Eigen_EulerAngles_SRCS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
+ )
diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
new file mode 100644
index 000000000..13a0da1ab
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLESCLASS_H// TODO: Fix previous "EIGEN_EULERANGLES_H" definition?
+#define EIGEN_EULERANGLESCLASS_H
+
+namespace Eigen
+{
+ /*template<typename Other,
+ int OtherRows=Other::RowsAtCompileTime,
+ int OtherCols=Other::ColsAtCompileTime>
+ struct ei_eulerangles_assign_impl;*/
+
+ /** \class EulerAngles
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * \brief Represents a rotation in a 3 dimensional space as three Euler angles.
+ *
+ * An Euler rotation is a set of three rotations through three angles about three fixed axes, defined by the EulerSystem given as a template parameter.
+ *
+ * Here is how intrinsic Euler angles work:
+ * - first, rotate the axes system about the alpha axis by angle alpha
+ * - then, rotate the axes system about the beta axis (which was rotated in the first stage) by angle beta
+ * - then, rotate the axes system about the gamma axis (which was rotated in the two stages above) by angle gamma
+ *
+ * \note For simplicity, this class supports only intrinsic Euler angles;
+ * see EulerSystem for how to easily overcome this for extrinsic systems.
+ *
+ * ### Rotation representation and conversions ###
+ *
+ * It has been proven (see the Wikipedia link below) that every rotation can be represented
+ * by Euler angles, but there is no unique representation (e.g. unlike rotation matrices).
+ * Therefore, you can convert from Eigen rotation types and to them
+ * (including rotation matrices, which are not called "rotations" by Eigen design).
+ *
+ * Euler angles are usually used for:
+ * - convenient human representation of rotation, especially in interactive GUIs.
+ * - gimbal systems and robotics
+ * - efficient encoding (i.e. 3 floats only) of rotation for network protocols.
+ *
+ * However, Euler angles are slow compared to quaternions or matrices,
+ * because of their unnatural math definition, although it is simple for humans.
+ * To overcome this, this class provides easy movement from the math-friendly representation
+ * to the human-friendly representation, and vice versa.
+ *
+ * All the user needs to do is a safe, simple C++ type conversion,
+ * and this class takes care of the math.
+ * Additionally, some axes-related computation is done at compile time.
+ *
+ * #### Euler angles ranges in conversions ####
+ *
+ * When converting a rotation to Euler angles, there are several ways you can control
+ * the Euler angle ranges.
+ *
+ * #### implicit ranges ####
+ * When using implicit ranges, all angles are guaranteed to be in the range [-PI, +PI],
+ * unless you convert from some other Euler angles.
+ * In that case, the range is __undefined__ (it might even be less than -PI or greater than +2*PI).
+ * \sa EulerAngles(const MatrixBase<Derived>&)
+ * \sa EulerAngles(const RotationBase<Derived, 3>&)
+ *
+ * #### explicit ranges ####
+ * When using explicit ranges, all angles are guaranteed to be in the range you choose.
+ * With the range Boolean parameter, you are asked whether you prefer the positive range or not:
+ * - _true_ - force the range to [0, +2*PI]
+ * - _false_ - force the range to [-PI, +PI]
+ *
+ * ##### compile time ranges #####
+ * Use these when you have compile-time ranges and you prefer to
+ * use template parameters (e.g. for performance).
+ * \sa FromRotation()
+ *
+ * ##### run-time ranges #####
+ * Run-time ranges are also supported.
+ * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool)
+ * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool)
+ *
+ * ### Convenient user typedefs ###
+ *
+ * Convenient typedefs for EulerAngles exist for float and double scalars,
+ * in the form EulerAngles{A}{B}{C}{scalar},
+ * e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf.
+ *
+ * Convenient typedefs exist only for Euler systems with positive axes {+x,+y,+z}.
+ * If you need negative axes {-x,-y,-z}, it is recommended to create your own typedef with
+ * a name that represents what you need.
+ *
+ * ### Example ###
+ *
+ * \include EulerAngles.cpp
+ * Output: \verbinclude EulerAngles.out
+ *
+ * ### Additional reading ###
+ *
+ * If you want to get a better idea of how the Euler system works in Eigen, see EulerSystem.
+ *
+ * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+ *
+ * \tparam _Scalar the scalar type, i.e., the type of the angles.
+ *
+ * \tparam _System the EulerSystem to use, which represents the axes of rotation.
+ */
+ template <typename _Scalar, class _System>
+ class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
+ {
+ public:
+ /** the scalar type of the angles */
+ typedef _Scalar Scalar;
+
+ /** the EulerSystem to use, which represents the axes of rotation. */
+ typedef _System System;
+
+ typedef Matrix<Scalar,3,3> Matrix3; /*!< the equivalent rotation matrix type */
+ typedef Matrix<Scalar,3,1> Vector3; /*!< the equivalent 3 dimension vector type */
+ typedef Quaternion<Scalar> QuaternionType; /*!< the equivalent quaternion type */
+ typedef AngleAxis<Scalar> AngleAxisType; /*!< the equivalent angle-axis type */
+
+ /** \returns the axis vector of the first (alpha) rotation */
+ static Vector3 AlphaAxisVector() {
+ const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1);
+ return System::IsAlphaOpposite ? -u : u;
+ }
+
+ /** \returns the axis vector of the second (beta) rotation */
+ static Vector3 BetaAxisVector() {
+ const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1);
+ return System::IsBetaOpposite ? -u : u;
+ }
+
+ /** \returns the axis vector of the third (gamma) rotation */
+ static Vector3 GammaAxisVector() {
+ const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1);
+ return System::IsGammaOpposite ? -u : u;
+ }
+
+ private:
+ Vector3 m_angles;
+
+ public:
+ /** Default constructor without initialization. */
+ EulerAngles() {}
+ /** Constructs and initializes Euler angles (\p alpha, \p beta, \p gamma). */
+ EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
+ m_angles(alpha, beta, gamma) {}
+
+ /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m.
+ *
+ * \note All angles will be in the range [-PI, PI].
+ */
+ template<typename Derived>
+ EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
+
+ /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m,
+ * with options to choose for each angle the requested range.
+ *
+ * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+ * Otherwise, the specified angle will be in the range [-PI, +PI].
+ *
+ * \param m The 3x3 rotation matrix to convert
+ * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ */
+ template<typename Derived>
+ EulerAngles(
+ const MatrixBase<Derived>& m,
+ bool positiveRangeAlpha,
+ bool positiveRangeBeta,
+ bool positiveRangeGamma) {
+
+ System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+ }
+
+ /** Constructs and initializes Euler angles from a rotation \p rot.
+ *
+ * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles.
+ * If \p rot is an EulerAngles, the resulting range is __undefined__.
+ * (Use the other constructors here to enforce a range if that is desired.)
+ */
+ template<typename Derived>
+ EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; }
+
+ /** Constructs and initializes Euler angles from a rotation \p rot,
+ * with options to choose for each angle the requested range.
+ *
+ * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+ * Otherwise, the specified angle will be in the range [-PI, +PI].
+ *
+ * \param rot The 3x3 rotation matrix to convert
+ * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ */
+ template<typename Derived>
+ EulerAngles(
+ const RotationBase<Derived, 3>& rot,
+ bool positiveRangeAlpha,
+ bool positiveRangeBeta,
+ bool positiveRangeGamma) {
+
+ System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+ }
+
+ /** \returns The angle values stored in a vector (alpha, beta, gamma). */
+ const Vector3& angles() const { return m_angles; }
+ /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */
+ Vector3& angles() { return m_angles; }
+
+ /** \returns The value of the first angle. */
+ Scalar alpha() const { return m_angles[0]; }
+ /** \returns A read-write reference to the first angle. */
+ Scalar& alpha() { return m_angles[0]; }
+
+ /** \returns The value of the second angle. */
+ Scalar beta() const { return m_angles[1]; }
+ /** \returns A read-write reference to the second angle. */
+ Scalar& beta() { return m_angles[1]; }
+
+ /** \returns The value of the third angle. */
+ Scalar gamma() const { return m_angles[2]; }
+ /** \returns A read-write reference to the third angle. */
+ Scalar& gamma() { return m_angles[2]; }
+
+ /** \returns The Euler angles rotation inverse (which is the same as the negative),
+ * (-alpha, -beta, -gamma).
+ */
+ EulerAngles inverse() const
+ {
+ EulerAngles res;
+ res.m_angles = -m_angles;
+ return res;
+ }
+
+ /** \returns The Euler angles rotation negative (which is the same as the inverse),
+ * (-alpha, -beta, -gamma).
+ */
+ EulerAngles operator -() const
+ {
+ return inverse();
+ }
+
+ /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m,
+ * with options to choose for each angle the requested range (__only in compile time__).
+ *
+ * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+ * Otherwise, the specified angle will be in the range [-PI, +PI].
+ *
+ * \param m The 3x3 rotation matrix to convert
+ * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ */
+ template<
+ bool PositiveRangeAlpha,
+ bool PositiveRangeBeta,
+ bool PositiveRangeGamma,
+ typename Derived>
+ static EulerAngles FromRotation(const MatrixBase<Derived>& m)
+ {
+ EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+
+ EulerAngles e;
+ System::template CalcEulerAngles<
+ PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m);
+ return e;
+ }
+
+ /** Constructs and initializes Euler angles from a rotation \p rot,
+ * with options to choose for each angle the requested range (__only in compile time__).
+ *
+ * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+ * Otherwise, the specified angle will be in the range [-PI, +PI].
+ *
+ * \param rot The 3x3 rotation matrix to convert
+ * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ */
+ template<
+ bool PositiveRangeAlpha,
+ bool PositiveRangeBeta,
+ bool PositiveRangeGamma,
+ typename Derived>
+ static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot)
+ {
+ return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix());
+ }
+
+ /*EulerAngles& fromQuaternion(const QuaternionType& q)
+ {
+ // TODO: Implement it in a faster way for quaternions
+ // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+ // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+ // Currently we compute all matrix cells from quaternion.
+
+ // Special case only for ZYX
+ //Scalar y2 = q.y() * q.y();
+ //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+ //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+ //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+ }*/
+
+ /** Set \c *this from a rotation matrix (i.e. a pure orthogonal matrix with determinant +1). */
+ template<typename Derived>
+ EulerAngles& operator=(const MatrixBase<Derived>& m) {
+ EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+
+ System::CalcEulerAngles(*this, m);
+ return *this;
+ }
+
+ // TODO: Assign and construct from another EulerAngles (with different system)
+
+ /** Set \c *this from a rotation. */
+ template<typename Derived>
+ EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
+ System::CalcEulerAngles(*this, rot.toRotationMatrix());
+ return *this;
+ }
+
+ // TODO: Support isApprox function
+
+ /** \returns an equivalent 3x3 rotation matrix. */
+ Matrix3 toRotationMatrix() const
+ {
+ return static_cast<QuaternionType>(*this).toRotationMatrix();
+ }
+
+ /** Converts the Euler angles to an equivalent quaternion. */
+ operator QuaternionType() const
+ {
+ return
+ AngleAxisType(alpha(), AlphaAxisVector()) *
+ AngleAxisType(beta(), BetaAxisVector()) *
+ AngleAxisType(gamma(), GammaAxisVector());
+ }
+
+ friend std::ostream& operator<<(std::ostream& s, const EulerAngles<Scalar, System>& eulerAngles)
+ {
+ s << eulerAngles.angles().transpose();
+ return s;
+ }
+ };
+
+#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
+ /** \ingroup EulerAngles_Module */ \
+ typedef EulerAngles<SCALAR_TYPE, EulerSystem##AXES> EulerAngles##AXES##SCALAR_POSTFIX;
+
+#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX)
+
+EIGEN_EULER_ANGLES_TYPEDEFS(float, f)
+EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
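For illustration only (not part of the patch), the convenience typedefs above can be used as follows; the default constructor is assumed from earlier in the class, and the ZYZ system is an arbitrary choice:

  void euler_typedefs_demo()   // hypothetical helper, for illustration only
  {
    Eigen::EulerAnglesZYZd e;
    e.alpha() = 0.1; e.beta() = 0.2; e.gamma() = 0.3;  // set the three angles
    Eigen::Quaterniond q = e;                          // uses operator QuaternionType()
    Eigen::Matrix3d R = e.toRotationMatrix();
    Eigen::EulerAnglesZYZd back;
    back = q;            // operator=(const RotationBase<Derived,3>&), via toRotationMatrix()
    (void)R;
  }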
+
+ namespace internal
+ {
+ template<typename _Scalar, class _System>
+ struct traits<EulerAngles<_Scalar, _System> >
+ {
+ typedef _Scalar Scalar;
+ };
+ }
+
+}
+
+#endif // EIGEN_EULERANGLESCLASS_H
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
new file mode 100644
index 000000000..98f9f647d
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -0,0 +1,326 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERSYSTEM_H
+#define EIGEN_EULERSYSTEM_H
+
+namespace Eigen
+{
+ // Forward declarations
+ template <typename _Scalar, class _System>
+ class EulerAngles;
+
+ namespace internal
+ {
+ // TODO: Check if this already exists in the rest of the API
+ template <int Num, bool IsPositive = (Num > 0)>
+ struct Abs
+ {
+ enum { value = Num };
+ };
+
+ template <int Num>
+ struct Abs<Num, false>
+ {
+ enum { value = -Num };
+ };
+
+ template <int Axis>
+ struct IsValidAxis
+ {
+ enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
+ };
+ }
+
+ #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
+
+ /** \brief Representation of a fixed signed rotation axis for EulerSystem.
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * Values here represent:
+ * - The axis of the rotation: X, Y or Z.
+ * - The sign (i.e. the direction of the rotation along the axis): positive (+) or negative (-)
+ *
+ * Therefore, this can express all the signed axes {+X,+Y,+Z,-X,-Y,-Z}.
+ *
+ * For a positive axis, use +EULER_{axis}; for a negative axis, use -EULER_{axis}.
+ */
+ enum EulerAxis
+ {
+ EULER_X = 1, /*!< the X axis */
+ EULER_Y = 2, /*!< the Y axis */
+ EULER_Z = 3 /*!< the Z axis */
+ };
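A short sketch (not part of the patch) of how the signed axes above combine into an EulerSystem; the typedef names are hypothetical:

  // Classic right-handed ZYX (yaw-pitch-roll) system.
  typedef Eigen::EulerSystem<Eigen::EULER_Z, Eigen::EULER_Y, Eigen::EULER_X> MyZYXSystem;
  // Same order, but the second rotation runs along the negative Y direction.
  typedef Eigen::EulerSystem<Eigen::EULER_Z, -Eigen::EULER_Y, Eigen::EULER_X> MyNegYSystem;
  typedef Eigen::EulerAngles<double, MyNegYSystem> MyAngles;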
+
+ /** \class EulerSystem
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * \brief Represents a fixed Euler rotation system.
+ *
+ * The goal of this meta-class is to represent the Euler system at compile time, for EulerAngles.
+ *
+ * You can use this class for two things:
+ * - Build an Euler system, and then pass it as a template parameter to EulerAngles.
+ * - Query some compile-time data about an Euler system (e.g. whether it's Tait-Bryan).
+ *
+ * An Euler rotation is a set of three rotations about fixed axes. (see \ref EulerAngles)
+ * This meta-class stores those signed axes as compile-time constants. (see \ref EulerAxis)
+ *
+ * ### Types of Euler systems ###
+ *
+ * All, and only, valid 3-dimensional Euler rotations over the standard
+ * signed axes {+X,+Y,+Z,-X,-Y,-Z} are supported:
+ * - all axes X, Y, Z in every valid order (see below for which orders are valid)
+ * - rotation about each axis in both the positive and negative directions
+ * - both Tait-Bryan and proper/classic Euler angles (i.e. the non-Tait-Bryan systems).
+ *
+ * Since EulerSystem supports both positive and negative directions,
+ * you may also know this rotation-direction distinction by other names:
+ * - _right handed_ or _left handed_
+ * - _counterclockwise_ or _clockwise_
+ *
+ * Notice that not all axis combinations are valid; an invalid combination triggers a static assertion.
+ * The same unsigned axis can't appear twice in a row, e.g. {X,X,Y} is invalid.
+ * This yields exactly two classes:
+ * - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+ * - _proper/classic Euler angles_ - the first and third unsigned axes are equal
+ * and the second differs, e.g. {X,Y,X}
+ *
+ * ### Intrinsic vs extrinsic Euler systems ###
+ *
+ * Only intrinsic Euler systems are supported for simplicity.
+ * If you want to use extrinsic Euler systems,
+ * just use the equivalent intrinsic system with the reversed order of axes and angles,
+ * i.e. axes (A,B,C) become (C,B,A), and angles (a,b,c) become (c,b,a).
+ *
+ * ### Convenient user typedefs ###
+ *
+ * Convenient typedefs for EulerSystem exist (only for Euler systems with positive axes),
+ * in the form EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ.
+ *
+ * ### Additional reading ###
+ *
+ * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+ *
+ * \tparam _AlphaAxis the first fixed EulerAxis
+ *
+ * \tparam _BetaAxis the second fixed EulerAxis
+ *
+ * \tparam _GammaAxis the third fixed EulerAxis
+ */
+ template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
+ class EulerSystem
+ {
+ public:
+ // These are defined as static const int and not as an enum,
+ // because an enum is not guaranteed to support negative numbers.
+
+ /** The first rotation axis */
+ static const int AlphaAxis = _AlphaAxis;
+
+ /** The second rotation axis */
+ static const int BetaAxis = _BetaAxis;
+
+ /** The third rotation axis */
+ static const int GammaAxis = _GammaAxis;
+
+ enum
+ {
+ AlphaAxisAbs = internal::Abs<AlphaAxis>::value, /*!< the first rotation axis unsigned */
+ BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
+ GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
+
+ IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether the alpha axis is negative */
+ IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether the beta axis is negative */
+ IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether the gamma axis is negative */
+
+ IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
+ IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
+
+ IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
+ };
+
+ private:
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<AlphaAxis>::value,
+ ALPHA_AXIS_IS_INVALID);
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<BetaAxis>::value,
+ BETA_AXIS_IS_INVALID);
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<GammaAxis>::value,
+ GAMMA_AXIS_IS_INVALID);
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs,
+ ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS);
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
+ BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
+
+ enum
+ {
+ // I, J, K are the pivot index permutation of the rotation matrix that matches this Euler system.
+ // They are used by the converters in this class.
+ // They are always different from each other, and their possible values are: 0, 1, or 2.
+ I = AlphaAxisAbs - 1,
+ J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+ K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+ };
+
+ // TODO: Take the @mat parameter in a form that avoids double evaluation.
+ template <typename Derived>
+ static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
+ {
+ using std::atan2;
+ using std::sin;
+ using std::cos;
+
+ typedef typename Derived::Scalar Scalar;
+ typedef Matrix<Scalar,2,1> Vector2;
+
+ res[0] = atan2(mat(J,K), mat(K,K));
+ Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
+ if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
+ if(res[0] > Scalar(0)) {
+ res[0] -= Scalar(EIGEN_PI);
+ }
+ else {
+ res[0] += Scalar(EIGEN_PI);
+ }
+ res[1] = atan2(-mat(I,K), -c2);
+ }
+ else
+ res[1] = atan2(-mat(I,K), c2);
+ Scalar s1 = sin(res[0]);
+ Scalar c1 = cos(res[0]);
+ res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
+ }
+
+ template <typename Derived>
+ static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+ {
+ using std::atan2;
+ using std::sin;
+ using std::cos;
+
+ typedef typename Derived::Scalar Scalar;
+ typedef Matrix<Scalar,2,1> Vector2;
+
+ res[0] = atan2(mat(J,I), mat(K,I));
+ if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
+ {
+ if(res[0] > Scalar(0)) {
+ res[0] -= Scalar(EIGEN_PI);
+ }
+ else {
+ res[0] += Scalar(EIGEN_PI);
+ }
+ Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+ res[1] = -atan2(s2, mat(I,I));
+ }
+ else
+ {
+ Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+ res[1] = atan2(s2, mat(I,I));
+ }
+
+ // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+ // we can compute their respective rotation, and apply its inverse to M. Since the result must
+ // be a rotation around x, we have:
+ //
+ // c2 s1.s2 c1.s2 1 0 0
+ // 0 c1 -s1 * M = 0 c3 s3
+ // -s2 s1.c2 c1.c2 0 -s3 c3
+ //
+ // Thus: m11.c1 - m21.s1 = c3 & m12.c1 - m22.s1 = s3
+
+ Scalar s1 = sin(res[0]);
+ Scalar c1 = cos(res[0]);
+ res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
+ }
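The comment above can be restated in LaTeX (same 1-based indices as the comment, with c_i and s_i the cosine and sine of the i-th angle); this is an editorial restatement, not part of the patch:

  \begin{aligned}
  c_3 &= c_1\, m_{11} - s_1\, m_{21}, &
  s_3 &= c_1\, m_{12} - s_1\, m_{22}, &
  \gamma &= \operatorname{atan2}(s_3,\, c_3),
  \end{aligned}

which is exactly what the atan2 call computing res[2] above evaluates.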
+
+ template<typename Scalar>
+ static void CalcEulerAngles(
+ EulerAngles<Scalar, EulerSystem>& res,
+ const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+ {
+ CalcEulerAngles(res, mat, false, false, false);
+ }
+
+ template<
+ bool PositiveRangeAlpha,
+ bool PositiveRangeBeta,
+ bool PositiveRangeGamma,
+ typename Scalar>
+ static void CalcEulerAngles(
+ EulerAngles<Scalar, EulerSystem>& res,
+ const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+ {
+ CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma);
+ }
+
+ template<typename Scalar>
+ static void CalcEulerAngles(
+ EulerAngles<Scalar, EulerSystem>& res,
+ const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat,
+ bool PositiveRangeAlpha,
+ bool PositiveRangeBeta,
+ bool PositiveRangeGamma)
+ {
+ CalcEulerAngles_imp(
+ res.angles(), mat,
+ typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
+
+ if (IsAlphaOpposite == IsOdd)
+ res.alpha() = -res.alpha();
+
+ if (IsBetaOpposite == IsOdd)
+ res.beta() = -res.beta();
+
+ if (IsGammaOpposite == IsOdd)
+ res.gamma() = -res.gamma();
+
+ // Saturate results to the requested range
+ if (PositiveRangeAlpha && (res.alpha() < 0))
+ res.alpha() += Scalar(2 * EIGEN_PI);
+
+ if (PositiveRangeBeta && (res.beta() < 0))
+ res.beta() += Scalar(2 * EIGEN_PI);
+
+ if (PositiveRangeGamma && (res.gamma() < 0))
+ res.gamma() += Scalar(2 * EIGEN_PI);
+ }
+
+ template <typename _Scalar, class _System>
+ friend class Eigen::EulerAngles;
+ };
+
+#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
+ /** \ingroup EulerAngles_Module */ \
+ typedef EulerSystem<EULER_##A, EULER_##B, EULER_##C> EulerSystem##A##B##C;
+
+ EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,Z)
+ EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,X)
+ EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,Y)
+ EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,X)
+
+ EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,X)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,Y)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Z)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Y)
+
+ EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Y)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Z)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,X)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,Z)
+}
+
+#endif // EIGEN_EULERSYSTEM_H
diff --git a/unsupported/Eigen/src/FFT/CMakeLists.txt b/unsupported/Eigen/src/FFT/CMakeLists.txt
deleted file mode 100644
index edcffcb18..000000000
--- a/unsupported/Eigen/src/FFT/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_FFT_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_FFT_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/FFT COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt b/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
deleted file mode 100644
index 7986afc5e..000000000
--- a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_IterativeSolvers_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_IterativeSolvers_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/IterativeSolvers COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
index fbe21fc7e..5a82b0df6 100644
--- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -62,7 +62,7 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition
typedef typename Dest::RealScalar RealScalar;
typedef typename Dest::Scalar Scalar;
typedef Matrix < Scalar, Dynamic, 1 > VectorType;
- typedef Matrix < Scalar, Dynamic, Dynamic > FMatrixType;
+ typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
RealScalar tol = tol_error;
const Index maxIters = iters;
@@ -157,7 +157,8 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition
// insert coefficients into upper matrix triangle
H.col(k-1).head(k) = v.head(k);
- bool stop = (k==m || abs(w(k)) < tol * r0Norm || iters == maxIters);
+ tol_error = abs(w(k)) / r0Norm;
+ bool stop = (k==m || tol_error < tol || iters == maxIters);
if (stop || k == restart)
{
diff --git a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt b/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
deleted file mode 100644
index 4daefebee..000000000
--- a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_KroneckerProduct_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_KroneckerProduct_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/KroneckerProduct COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
index 4d3e5358e..582fa8512 100644
--- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -203,7 +203,7 @@ struct traits<KroneckerProduct<_Lhs,_Rhs> >
{
typedef typename remove_all<_Lhs>::type Lhs;
typedef typename remove_all<_Rhs>::type Rhs;
- typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+ typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
enum {
@@ -222,7 +222,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
typedef MatrixXpr XprKind;
typedef typename remove_all<_Lhs>::type Lhs;
typedef typename remove_all<_Rhs>::type Rhs;
- typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+ typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind;
typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
@@ -239,7 +239,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
- | EvalBeforeNestingBit | EvalBeforeAssigningBit,
+ | EvalBeforeNestingBit,
CoeffReadCost = HugeCost
};
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
deleted file mode 100644
index d9690854d..000000000
--- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_LevenbergMarquardt_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index b30e0a90a..995427978 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -304,7 +304,7 @@ LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType &x)
// m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative
if (!m_useExternalScaling)
m_diag.resize(n);
- eigen_assert( (!m_useExternalScaling || m_diag.size()==n) || "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
+ eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
m_qtf.resize(n);
/* Function Body */
diff --git a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt b/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
deleted file mode 100644
index cdde64d2c..000000000
--- a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MatrixFunctions_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_MatrixFunctions_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MatrixFunctions COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index bbb7e5776..4bb1852b6 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -65,7 +65,7 @@ template <typename MatrixType>
void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V)
{
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
- const RealScalar b[] = {120., 60., 12., 1.};
+ const RealScalar b[] = {120.L, 60.L, 12.L, 1.L};
const MatrixType A2 = A * A;
const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
U.noalias() = A * tmp;
@@ -81,7 +81,7 @@ template <typename MatrixType>
void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V)
{
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
- const RealScalar b[] = {30240., 15120., 3360., 420., 30., 1.};
+ const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L};
const MatrixType A2 = A * A;
const MatrixType A4 = A2 * A2;
const MatrixType tmp = b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
@@ -98,7 +98,7 @@ template <typename MatrixType>
void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V)
{
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
- const RealScalar b[] = {17297280., 8648640., 1995840., 277200., 25200., 1512., 56., 1.};
+ const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L};
const MatrixType A2 = A * A;
const MatrixType A4 = A2 * A2;
const MatrixType A6 = A4 * A2;
@@ -118,8 +118,8 @@ template <typename MatrixType>
void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V)
{
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
- const RealScalar b[] = {17643225600., 8821612800., 2075673600., 302702400., 30270240.,
- 2162160., 110880., 3960., 90., 1.};
+ const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L,
+ 2162160.L, 110880.L, 3960.L, 90.L, 1.L};
const MatrixType A2 = A * A;
const MatrixType A4 = A2 * A2;
const MatrixType A6 = A4 * A2;
@@ -139,9 +139,9 @@ template <typename MatrixType>
void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V)
{
typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
- const RealScalar b[] = {64764752532480000., 32382376266240000., 7771770303897600.,
- 1187353796428800., 129060195264000., 10559470521600., 670442572800.,
- 33522128640., 1323241920., 40840800., 960960., 16380., 182., 1.};
+ const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L,
+ 1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L,
+ 33522128640.L, 1323241920.L, 40840800.L, 960960.L, 16380.L, 182.L, 1.L};
const MatrixType A2 = A * A;
const MatrixType A4 = A2 * A2;
const MatrixType A6 = A4 * A2;
@@ -210,9 +210,9 @@ struct matrix_exp_computeUV<MatrixType, float>
using std::pow;
const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
squarings = 0;
- if (l1norm < 4.258730016922831e-001) {
+ if (l1norm < 4.258730016922831e-001f) {
matrix_exp_pade3(arg, U, V);
- } else if (l1norm < 1.880152677804762e+000) {
+ } else if (l1norm < 1.880152677804762e+000f) {
matrix_exp_pade5(arg, U, V);
} else {
const float maxnorm = 3.925724783138660f;
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index 8f7a6f3b0..db2449d02 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -132,6 +132,7 @@ template <typename EivalsType, typename Cluster>
void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
{
typedef typename EivalsType::Index Index;
+ typedef typename EivalsType::RealScalar RealScalar;
for (Index i=0; i<eivals.rows(); ++i) {
// Find cluster containing i-th ei'val, adding a new cluster if necessary
typename std::list<Cluster>::iterator qi = matrix_function_find_cluster(i, clusters);
@@ -145,7 +146,7 @@ void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<C
// Look for other element to add to the set
for (Index j=i+1; j<eivals.rows(); ++j) {
- if (abs(eivals(j) - eivals(i)) <= matrix_function_separation
+ if (abs(eivals(j) - eivals(i)) <= RealScalar(matrix_function_separation)
&& std::find(qi->begin(), qi->end(), j) == qi->end()) {
typename std::list<Cluster>::iterator qj = matrix_function_find_cluster(j, clusters);
if (qj == clusters.end()) {
@@ -403,11 +404,10 @@ struct matrix_function_compute<MatrixType, 0>
typedef internal::traits<MatrixType> Traits;
typedef typename Traits::Scalar Scalar;
static const int Rows = Traits::RowsAtCompileTime, Cols = Traits::ColsAtCompileTime;
- static const int Options = MatrixType::Options;
static const int MaxRows = Traits::MaxRowsAtCompileTime, MaxCols = Traits::MaxColsAtCompileTime;
typedef std::complex<Scalar> ComplexScalar;
- typedef Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols> ComplexMatrix;
+ typedef Matrix<ComplexScalar, Rows, Cols, 0, MaxRows, MaxCols> ComplexMatrix;
ComplexMatrix CA = A.template cast<ComplexScalar>();
ComplexMatrix Cresult;
@@ -508,9 +508,8 @@ template<typename Derived> class MatrixFunctionReturnValue
typedef internal::traits<NestedEvalTypeClean> Traits;
static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
- static const int Options = NestedEvalTypeClean::Options;
typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
- typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+ typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
AtomicType atomic(m_f);
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index e43e86e90..1acfbed9e 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -37,6 +37,7 @@ template <typename MatrixType>
void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
{
typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
using std::abs;
using std::ceil;
using std::imag;
@@ -54,14 +55,14 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
{
result(0,1) = A(0,1) / A(0,0);
}
- else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
+ else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
{
result(0,1) = A(0,1) * (logA11 - logA00) / y;
}
else
{
// computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
- int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
+ int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)));
result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
}
}
@@ -232,8 +233,8 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
MatrixType T = A, sqrtT;
int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
- const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1: // single precision
- maxPadeDegree<= 7? 2.6429608311114350e-1: // double precision
+ const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1L: // single precision
+ maxPadeDegree<= 7? 2.6429608311114350e-1L: // double precision
maxPadeDegree<= 8? 2.32777776523703892094e-1L: // extended precision
maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L: // double-double
1.1880960220216759245467951592883642e-1L; // quadruple precision
@@ -333,9 +334,8 @@ public:
typedef internal::traits<DerivedEvalTypeClean> Traits;
static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
- static const int Options = DerivedEvalTypeClean::Options;
typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
- typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+ typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
AtomicType atomic;
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index f37d31c3f..ebc433d89 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -196,11 +196,11 @@ void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const
{
using std::ldexp;
const int digits = std::numeric_limits<RealScalar>::digits;
- const RealScalar maxNormForPade = digits <= 24? 4.3386528e-1f: // sigle precision
- digits <= 53? 2.789358995219730e-1: // double precision
- digits <= 64? 2.4471944416607995472e-1L: // extended precision
- digits <= 106? 1.1016843812851143391275867258512e-1L: // double-double
- 9.134603732914548552537150753385375e-2L; // quadruple precision
+ const RealScalar maxNormForPade = digits <= 24? 4.3386528e-1L // single precision
+ : digits <= 53? 2.789358995219730e-1L // double precision
+ : digits <= 64? 2.4471944416607995472e-1L // extended precision
+ : digits <= 106? 1.1016843812851143391275867258512e-1L // double-double
+ : 9.134603732914548552537150753385375e-2L; // quadruple precision
MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
RealScalar normIminusT;
int degree, degree2, numberOfSquareRoots = 0;
@@ -264,7 +264,7 @@ inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT)
1.999045567181744e-1L, 2.789358995219730e-1L };
#elif LDBL_MANT_DIG <= 64
const int maxPadeDegree = 8;
- const double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
+ const long double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L };
#elif LDBL_MANT_DIG <= 106
const int maxPadeDegree = 10;
@@ -298,7 +298,7 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
ComplexScalar logCurr = log(curr);
ComplexScalar logPrev = log(prev);
- int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
+ int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
}
@@ -383,7 +383,7 @@ class MatrixPower : internal::noncopyable
private:
typedef std::complex<RealScalar> ComplexScalar;
- typedef Matrix<ComplexScalar, Dynamic, Dynamic, MatrixType::Options,
+ typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0,
MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> ComplexMatrix;
/** \brief Reference to the base of matrix power. */
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index 9f08c6162..afd88ec4d 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -65,21 +65,6 @@ void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, ty
sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs);
}
-// similar to compute1x1offDiagonalBlock()
-template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
-{
- typedef typename traits<MatrixType>::Scalar Scalar;
- Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
- Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
- Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
- if (j-i > 2)
- C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
- Matrix<Scalar,2,2> X;
- matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
- sqrtT.template block<2,2>(i,j) = X;
-}
-
// solves the equation A X + X B = C where all matrices are 2-by-2
template <typename MatrixType>
void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C)
@@ -98,13 +83,13 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
coeffMatrix.coeffRef(2,3) = B.coeff(1,0);
coeffMatrix.coeffRef(3,1) = A.coeff(1,0);
coeffMatrix.coeffRef(3,2) = B.coeff(0,1);
-
+
Matrix<Scalar,4,1> rhs;
rhs.coeffRef(0) = C.coeff(0,0);
rhs.coeffRef(1) = C.coeff(0,1);
rhs.coeffRef(2) = C.coeff(1,0);
rhs.coeffRef(3) = C.coeff(1,1);
-
+
Matrix<Scalar,4,1> result;
result = coeffMatrix.fullPivLu().solve(rhs);
@@ -114,6 +99,20 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
X.coeffRef(1,1) = result.coeff(3);
}
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+{
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
+ Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
+ Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
+ if (j-i > 2)
+ C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
+ Matrix<Scalar,2,2> X;
+ matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
+ sqrtT.template block<2,2>(i,j) = X;
+}
// pre: T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
diff --git a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt b/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
deleted file mode 100644
index 1b887cc8e..000000000
--- a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MoreVectorization_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_MoreVectorization_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MoreVectorization COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt b/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
deleted file mode 100644
index 9322ddadf..000000000
--- a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NonLinearOptimization_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_NonLinearOptimization_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NonLinearOptimization COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
index b8ba6ddcb..8fe3ed86b 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -150,7 +150,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType &x)
fjac.resize(n, n);
if (!useExternalScaling)
diag.resize(n);
- eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
/* Function Body */
nfev = 0;
@@ -390,7 +390,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType &
fvec.resize(n);
if (!useExternalScaling)
diag.resize(n);
- eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
/* Function Body */
nfev = 0;
diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
index 69106ddc5..fe3b79ca7 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -179,7 +179,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType &x)
fjac.resize(m, n);
if (!useExternalScaling)
diag.resize(n);
- eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
qtf.resize(n);
/* Function Body */
@@ -215,7 +215,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType &x)
{
using std::abs;
using std::sqrt;
-
+
eigen_assert(x.size()==n); // check the caller is not cheating us
/* calculate the jacobian matrix. */
@@ -398,7 +398,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType
fjac.resize(n, n);
if (!useExternalScaling)
diag.resize(n);
- eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
qtf.resize(n);
/* Function Body */
diff --git a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt b/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
deleted file mode 100644
index 1199aca2f..000000000
--- a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NumericalDiff_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_NumericalDiff_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NumericalDiff COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/Polynomials/CMakeLists.txt b/unsupported/Eigen/src/Polynomials/CMakeLists.txt
deleted file mode 100644
index 51f13f3cb..000000000
--- a/unsupported/Eigen/src/Polynomials/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Polynomials_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Polynomials_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Polynomials COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/Skyline/CMakeLists.txt b/unsupported/Eigen/src/Skyline/CMakeLists.txt
deleted file mode 100644
index 3bf1b0dd4..000000000
--- a/unsupported/Eigen/src/Skyline/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Skyline_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Skyline_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Skyline COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt b/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
deleted file mode 100644
index 7ea32ca5e..000000000
--- a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseExtra_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_SparseExtra_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SparseExtra COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
new file mode 100644
index 000000000..ed415db99
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -0,0 +1,124 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise incomplete gamma function.
+ *
+ * \note This function supports only float and double scalar types in C++11 mode. To support other scalar types,
+ * or float/double in non C++11 mode, the user has to provide implementations of igamma(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igammac(), Eigen::lgamma()
+ */
+template<typename Derived,typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+ a.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise complementary incomplete gamma function.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template<typename Derived,typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+ a.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays.
+ *
+ * It returns the \a n -th derivative of the digamma (psi) function evaluated at \c x.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::digamma()
+ */
+// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+// * \sa ArrayBase::polygamma()
+template<typename DerivedN,typename DerivedX>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
+ n.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given arrays.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::betainc(), Eigen::lgamma()
+ */
+template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
+inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
+{
+ return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
+ a.derived(),
+ b.derived(),
+ x.derived()
+ );
+}
+
+
+/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays.
+ *
+ * It returns the Riemann zeta function of two arguments \a x and \a q:
+ *
+ * \param x is the exponent; it must be > 1
+ * \param q is the shift; it must be > 0
+ *
+ * \note This function supports only float and double scalar types. To support other scalar types, the user has
+ * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+ *
+ * \sa ArrayBase::zeta()
+ */
+template<typename DerivedX,typename DerivedQ>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
+ x.derived(),
+ q.derived()
+ );
+}
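A minimal usage sketch (not part of the patch) of the free-function array API declared above, assuming the module header unsupported/Eigen/SpecialFunctions added by this patch; the array sizes and values are arbitrary:

  #include <iostream>
  #include <Eigen/Core>
  #include <unsupported/Eigen/SpecialFunctions>

  int main()
  {
    Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 0.5, 2.0);
    Eigen::ArrayXd x = Eigen::ArrayXd::Constant(4, 1.0);
    Eigen::ArrayXd p = Eigen::igamma(a, x);    // regularized lower incomplete gamma
    Eigen::ArrayXd q = Eigen::igammac(a, x);   // its complement: p + q == 1 coefficient-wise
    Eigen::ArrayXd z = Eigen::zeta(Eigen::ArrayXd::Constant(4, 2.0), a);
    std::cout << (p + q).transpose() << "\n";  // prints approximately 1 1 1 1
    std::cout << z.transpose() << "\n";
    return 0;
  }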
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
new file mode 100644
index 000000000..d8f2363be
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/** \internal
+ * \brief Template functor to compute the incomplete gamma function igamma(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igamma
+ */
+template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar>
+{
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+ using numext::igamma; return igamma(a, x);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+ return internal::pigamma(a, x);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igamma_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasIGamma
+ };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igammac
+ */
+template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar>
+{
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+ using numext::igammac; return igammac(a, x);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
+ {
+ return internal::pigammac(a, x);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igammac_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasIGammac
+ };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the incomplete beta integral betainc(a, b, x)
+ *
+ */
+template<typename Scalar> struct scalar_betainc_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const {
+ using numext::betainc; return betainc(x, a, b);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const
+ {
+ return internal::pbetainc(x, a, b);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_betainc_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBetaInc
+ };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the natural log of the absolute
+ * value of Gamma of a scalar
+ * \sa class CwiseUnaryOp, Cwise::lgamma()
+ */
+template<typename Scalar> struct scalar_lgamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::lgamma; return lgamma(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_lgamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasLGamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
+ * \sa class CwiseUnaryOp, Cwise::digamma()
+ */
+template<typename Scalar> struct scalar_digamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::digamma; return digamma(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_digamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasDiGamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Riemann Zeta function of two arguments.
+ * \sa class CwiseUnaryOp, Cwise::zeta()
+ */
+template<typename Scalar> struct scalar_zeta_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
+ using numext::zeta; return zeta(x, q);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_zeta_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasZeta
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the polygamma function.
+ * \sa class CwiseUnaryOp, Cwise::polygamma()
+ */
+template<typename Scalar> struct scalar_polygamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
+ using numext::polygamma; return polygamma(n, x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_polygamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasPolygamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Gauss error function of a
+ * scalar
+ * \sa class CwiseUnaryOp, Cwise::erf()
+ */
+template<typename Scalar> struct scalar_erf_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::erf; return erf(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasErf
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Complementary Error Function
+ * of a scalar
+ * \sa class CwiseUnaryOp, Cwise::erfc()
+ */
+template<typename Scalar> struct scalar_erfc_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::erfc; return erfc(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erfc_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasErfc
+ };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
new file mode 100644
index 000000000..553bcda6a
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -0,0 +1,47 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H
+#define EIGEN_SPECIALFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+ return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
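A brief sketch (not part of the patch) of the half-precision overloads above, assuming C99 math support (EIGEN_HAS_C99_MATH) and the Eigen::half type; they simply compute in float and cast the result back:

  Eigen::half h(1.5f);
  Eigen::half lg = Eigen::numext::lgamma(h);  // forwards to the float implementation
  Eigen::half er = Eigen::numext::erf(h);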
+
+} // end namespace numext
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_HALF_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
new file mode 100644
index 000000000..52619fc0c
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -0,0 +1,1551 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIAL_FUNCTIONS_H
+#define EIGEN_SPECIAL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+// Parts of this code are based on the Cephes Math Library.
+//
+// Cephes Math Library Release 2.8: June, 2000
+// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+// Permission has been kindly provided by the original author
+// to incorporate the Cephes software into the Eigen codebase:
+//
+// From: Stephen Moshier
+// To: Eugene Brevdo
+// Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+// Hello Eugene,
+//
+// Thank you for writing.
+//
+// If your licensing is similar to BSD, the formal way that has been
+// handled is simply to add a statement to the effect that you are incorporating
+// the Cephes software by permission of the author.
+//
+// Good luck with your project,
+// Steve
+
+namespace cephes {
+
+/* polevl (modified for Eigen)
+ *
+ * Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ * 2 N
+ * y = C + C x + C x +...+ C x
+ * 0 1 2 N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C , ..., coef[N] = C .
+ * N 0
+ *
+ * The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array. Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized. For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
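+// Example (illustrative): evaluating y = 3x^2 + 2x + 1 with the coefficients
+// stored highest degree first, as described above:
+//
+//   const double coef[] = {3.0, 2.0, 1.0};
+//   double y = polevl<double, 2>::run(x, coef);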
+template <typename Scalar, int N>
+struct polevl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
+ EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
+ }
+};
+
+template <typename Scalar>
+struct polevl<Scalar, 0> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
+ return coef[0];
+ }
+};
+
+} // end namespace cephes
+
+/****************************************************************************
+ * Implementation of lgamma, requires C++11/C99 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct lgamma_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <typename Scalar>
+struct lgamma_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct lgamma_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); }
+};
+
+template <>
+struct lgamma_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); }
+};
+#endif
+
+/****************************************************************************
+ * Implementation of digamma (psi), based on Cephes *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct digamma_retval {
+ typedef Scalar type;
+};
+
+/*
+ *
+ * Polynomial evaluation helper for the Psi (digamma) function.
+ *
+ * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for
+ * input Scalar s, assuming s is above 10.0.
+ *
+ * If s is above a certain threshold for the given Scalar type, zero
+ * is returned. Otherwise the polynomial is evaluated with enough
+ * coefficients for results matching Scalar machine precision.
+ *
+ *
+ */
+template <typename Scalar>
+struct digamma_impl_maybe_poly {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+
+template <>
+struct digamma_impl_maybe_poly<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(const float s) {
+ const float A[] = {
+ -4.16666666666666666667E-3f,
+ 3.96825396825396825397E-3f,
+ -8.33333333333333333333E-3f,
+ 8.33333333333333333333E-2f
+ };
+
+ float z;
+ if (s < 1.0e8f) {
+ z = 1.0f / (s * s);
+ return z * cephes::polevl<float, 3>::run(z, A);
+ } else return 0.0f;
+ }
+};
+
+template <>
+struct digamma_impl_maybe_poly<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(const double s) {
+ const double A[] = {
+ 8.33333333333333333333E-2,
+ -2.10927960927960927961E-2,
+ 7.57575757575757575758E-3,
+ -4.16666666666666666667E-3,
+ 3.96825396825396825397E-3,
+ -8.33333333333333333333E-3,
+ 8.33333333333333333333E-2
+ };
+
+ double z;
+ if (s < 1.0e17) {
+ z = 1.0 / (s * s);
+ return z * cephes::polevl<double, 6>::run(z, A);
+ }
+ else return 0.0;
+ }
+};
+
+template <typename Scalar>
+struct digamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar x) {
+ /*
+ *
+ * Psi (digamma) function (modified for Eigen)
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, psi();
+ *
+ * y = psi( x );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * d -
+ * psi(x) = -- ln | (x)
+ * dx
+ *
+ * is the logarithmic derivative of the gamma function.
+ * For integer x,
+ * n-1
+ * -
+ * psi(n) = -EUL + > 1/k.
+ * -
+ * k=1
+ *
+ * If x is negative, it is transformed to a positive argument by the
+ * reflection formula psi(1-x) = psi(x) + pi cot(pi x).
+ * For general positive x, the argument is made greater than 10
+ * using the recurrence psi(x+1) = psi(x) + 1/x.
+ * Then the following asymptotic expansion is applied:
+ *
+ * inf. B
+ * - 2k
+ * psi(x) = log(x) - 1/2x - > -------
+ * - 2k
+ * k=1 2k x
+ *
+ * where the B2k are Bernoulli numbers.
+ *
+ * ACCURACY (double):
+ * Relative error (except absolute when |psi| < 1):
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 30000 1.3e-15 1.4e-16
+ * IEEE -30,0 40000 1.5e-15 2.2e-16
+ *
+ * ACCURACY (float):
+ * Absolute error, relative when |psi| > 1 :
+ * arithmetic domain # trials peak rms
+ * IEEE -33,0 30000 8.2e-7 1.2e-7
+ * IEEE 0,33 100000 7.3e-7 7.7e-8
+ *
+ * ERROR MESSAGES:
+ * message condition value returned
+ * psi singularity x integer <=0 INFINITY
+ */
+
+ Scalar p, q, nz, s, w, y;
+ bool negative = false;
+
+ const Scalar maxnum = NumTraits<Scalar>::infinity();
+ const Scalar m_pi = Scalar(EIGEN_PI);
+
+ const Scalar zero = Scalar(0);
+ const Scalar one = Scalar(1);
+ const Scalar half = Scalar(0.5);
+ nz = zero;
+
+ if (x <= zero) {
+ negative = true;
+ q = x;
+ p = numext::floor(q);
+ if (p == q) {
+ return maxnum;
+ }
+ /* Remove the zeros of tan(m_pi x)
+ * by subtracting the nearest integer from x
+ */
+ nz = q - p;
+ if (nz != half) {
+ if (nz > half) {
+ p += one;
+ nz = q - p;
+ }
+ nz = m_pi / numext::tan(m_pi * nz);
+ }
+ else {
+ nz = zero;
+ }
+ x = one - x;
+ }
+
+ /* use the recurrence psi(x+1) = psi(x) + 1/x. */
+ s = x;
+ w = zero;
+ while (s < Scalar(10)) {
+ w += one / s;
+ s += one;
+ }
+
+ y = digamma_impl_maybe_poly<Scalar>::run(s);
+
+ y = numext::log(s) - (half / s) - y - w;
+
+ return (negative) ? y - nz : y;
+ }
+};
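+// Reference value for the routine above: psi(1/2) = -EULER_GAMMA - 2*log(2)
+// ~= -1.9635100260214235; the argument is positive, so only the recurrence
+// and asymptotic-expansion branches are exercised.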
+
+/****************************************************************************
+ * Implementation of erf, requires C++11/C99 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct erf_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <typename Scalar>
+struct erf_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erf_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
+};
+
+template <>
+struct erf_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
+};
+#endif // EIGEN_HAS_C99_MATH
+
+/***************************************************************************
+* Implementation of erfc, requires C++11/C99 *
+****************************************************************************/
+
+template <typename Scalar>
+struct erfc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <typename Scalar>
+struct erfc_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erfc_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
+};
+
+template <>
+struct erfc_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
+};
+#endif // EIGEN_HAS_C99_MATH
+
+/**************************************************************************************************************
+ * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ **************************************************************************************************************/
+
+template <typename Scalar>
+struct igammac_retval {
+ typedef Scalar type;
+};
+
+// NOTE: cephes_helper provides the machine-precision and rescaling constants
+// (machep, big, biginv) shared by the Cephes-based routines below; it is also
+// used to implement zeta.
+template <typename Scalar>
+struct cephes_helper {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; }
+};
+
+template <>
+struct cephes_helper<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float machep() {
+ return NumTraits<float>::epsilon() / 2; // 1.0 - machep == 1.0
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float big() {
+ // use epsneg (1.0 - epsneg == 1.0)
+ return 1.0f / (NumTraits<float>::epsilon() / 2);
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float biginv() {
+ // epsneg
+ return machep();
+ }
+};
+
+template <>
+struct cephes_helper<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double machep() {
+ return NumTraits<double>::epsilon() / 2; // 1.0 - machep == 1.0
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double big() {
+ return 1.0 / NumTraits<double>::epsilon();
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double biginv() {
+ // eps (the inverse of big above)
+ return NumTraits<double>::epsilon();
+ }
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igammac_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar> struct igamma_impl; // predeclare igamma_impl
+
+template <typename Scalar>
+struct igammac_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ /* igamc()
+ *
+ * Incomplete gamma integral (modified for Eigen)
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double a, x, y, igamc();
+ *
+ * y = igamc( a, x );
+ *
+ * DESCRIPTION:
+ *
+ * The function is defined by
+ *
+ *
+ * igamc(a,x) = 1 - igam(a,x)
+ *
+ * inf.
+ * -
+ * 1 | | -t a-1
+ * = ----- | e t dt.
+ * - | |
+ * | (a) -
+ * x
+ *
+ *
+ * In this implementation both arguments must be positive.
+ * The integral is evaluated by either a power series or
+ * continued fraction expansion, depending on the relative
+ * values of a and x.
+ *
+ * ACCURACY (float):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 30000 7.8e-6 5.9e-7
+ *
+ *
+ * ACCURACY (double):
+ *
+ * Tested at random a, x.
+ * a x Relative error:
+ * arithmetic domain domain # trials peak rms
+ * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15
+ * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15
+ *
+ */
+ /*
+ Cephes Math Library Release 2.2: June, 1992
+ Copyright 1985, 1987, 1992 by Stephen L. Moshier
+ Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if ((x < zero) || (a <= zero)) {
+ // domain error
+ return nan;
+ }
+
+ if ((x < one) || (x < a)) {
+ /* The checks above ensure that we meet the preconditions for
+ * igamma_impl::Impl(), so call it, rather than igamma_impl::run().
+ * Calling run() would also work, but in that case the compiler may not be
+ * able to prove that igammac_impl::run and igamma_impl::run are not
+ * mutually recursive. This leads to worse code, particularly on
+ * platforms like nvptx, where recursion is allowed only begrudgingly.
+ */
+ return (one - igamma_impl<Scalar>::Impl(a, x));
+ }
+
+ return Impl(a, x);
+ }
+
+ private:
+ /* igamma_impl calls igammac_impl::Impl. */
+ friend struct igamma_impl<Scalar>;
+
+ /* Actually computes igamc(a, x).
+ *
+ * Preconditions:
+ * a > 0
+ * x >= 1
+ * x >= a
+ */
+ EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar two = 2;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+ const Scalar big = cephes_helper<Scalar>::big();
+ const Scalar biginv = cephes_helper<Scalar>::biginv();
+ const Scalar inf = NumTraits<Scalar>::infinity();
+
+ Scalar ans, ax, c, yc, r, t, y, z;
+ Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
+
+ if (x == inf) return zero; // std::isinf crashes on CUDA
+
+ /* Compute x**a * exp(-x) / gamma(a) */
+ ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+ if (ax < -maxlog) { // underflow
+ return zero;
+ }
+ ax = numext::exp(ax);
+
+ // continued fraction
+ y = one - a;
+ z = x + y + one;
+ c = zero;
+ pkm2 = one;
+ qkm2 = x;
+ pkm1 = x + one;
+ qkm1 = z * x;
+ ans = pkm1 / qkm1;
+
+ while (true) {
+ c += one;
+ y += one;
+ z += two;
+ yc = y * c;
+ pk = pkm1 * z - pkm2 * yc;
+ qk = qkm1 * z - qkm2 * yc;
+ if (qk != zero) {
+ r = pk / qk;
+ t = numext::abs((ans - r) / r);
+ ans = r;
+ } else {
+ t = one;
+ }
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
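+ // Rescale the recurrence terms when they grow too large, so that the
+ // ratio pk / qk stays representable.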
+ if (numext::abs(pk) > big) {
+ pkm2 *= biginv;
+ pkm1 *= biginv;
+ qkm2 *= biginv;
+ qkm1 *= biginv;
+ }
+ if (t <= machep) {
+ break;
+ }
+ }
+
+ return (ans * ax);
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+/************************************************************************************************
+ * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct igamma_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igamma_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct igamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ /* igam()
+ * Incomplete gamma integral
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double a, x, y, igam();
+ *
+ * y = igam( a, x );
+ *
+ * DESCRIPTION:
+ *
+ * The function is defined by
+ *
+ * x
+ * -
+ * 1 | | -t a-1
+ * igam(a,x) = ----- | e t dt.
+ * - | |
+ * | (a) -
+ * 0
+ *
+ *
+ * In this implementation both arguments must be positive.
+ * The integral is evaluated by either a power series or
+ * continued fraction expansion, depending on the relative
+ * values of a and x.
+ *
+ * ACCURACY (double):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 200000 3.6e-14 2.9e-15
+ * IEEE 0,100 300000 9.9e-14 1.5e-14
+ *
+ *
+ * ACCURACY (float):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 20000 7.8e-6 5.9e-7
+ *
+ */
+ /*
+ Cephes Math Library Release 2.2: June, 1992
+ Copyright 1985, 1987, 1992 by Stephen L. Moshier
+ Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+
+
+ /* left tail of incomplete gamma function:
+ *
+ * inf. k
+ * a -x - x
+ * x e > ----------
+ * - -
+ * k=0 | (a+k+1)
+ *
+ */
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if (x == zero) return zero;
+
+ if ((x < zero) || (a <= zero)) { // domain error
+ return nan;
+ }
+
+ if ((x > one) && (x > a)) {
+ /* The checks above ensure that we meet the preconditions for
+ * igammac_impl::Impl(), so call it, rather than igammac_impl::run().
+ * Calling run() would also work, but in that case the compiler may not be
+ * able to prove that igammac_impl::run and igamma_impl::run are not
+ * mutually recursive. This leads to worse code, particularly on
+ * platforms like nvptx, where recursion is allowed only begrudgingly.
+ */
+ return (one - igammac_impl<Scalar>::Impl(a, x));
+ }
+
+ return Impl(a, x);
+ }
+
+ private:
+ /* igammac_impl calls igamma_impl::Impl. */
+ friend struct igammac_impl<Scalar>;
+
+ /* Actually computes igam(a, x).
+ *
+ * Preconditions:
+ * x > 0
+ * a > 0
+ * !(x > 1 && x > a)
+ */
+ EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+
+ Scalar ans, ax, c, r;
+
+ /* Compute x**a * exp(-x) / gamma(a) */
+ ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+ if (ax < -maxlog) {
+ // underflow
+ return zero;
+ }
+ ax = numext::exp(ax);
+
+ /* power series */
+ r = a;
+ c = one;
+ ans = one;
+
+ while (true) {
+ r += one;
+ c *= x/r;
+ ans += c;
+ if (c/ans <= machep) {
+ break;
+ }
+ }
+
+ return (ans * ax / a);
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+/*****************************************************************************
+ * Implementation of Riemann zeta function of two arguments, based on Cephes *
+ *****************************************************************************/
+
+template <typename Scalar>
+struct zeta_retval {
+ typedef Scalar type;
+};
+
+template <typename Scalar>
+struct zeta_impl_series {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <>
+struct zeta_impl_series<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) {
+ int i = 0;
+ while(i < 9)
+ {
+ i += 1;
+ a += 1.0f;
+ b = numext::pow( a, -x );
+ s += b;
+ if( numext::abs(b/s) < machep )
+ return true;
+ }
+
+ // Return whether we are done
+ return false;
+ }
+};
+
+template <>
+struct zeta_impl_series<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) {
+ int i = 0;
+ while( (i < 9) || (a <= 9.0) )
+ {
+ i += 1;
+ a += 1.0;
+ b = numext::pow( a, -x );
+ s += b;
+ if( numext::abs(b/s) < machep )
+ return true;
+ }
+
+ // Return whether we are done
+ return false;
+ }
+};
+
+template <typename Scalar>
+struct zeta_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar x, Scalar q) {
+ /* zeta.c
+ *
+ * Riemann zeta function of two arguments
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, q, y, zeta();
+ *
+ * y = zeta( x, q );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ *
+ *
+ * inf.
+ * - -x
+ * zeta(x,q) = > (k+q)
+ * -
+ * k=0
+ *
+ * where x > 1 and q is not a negative integer or zero.
+ * The Euler-Maclaurin summation formula is used to obtain
+ * the expansion
+ *
+ * n
+ * - -x
+ * zeta(x,q) = > (k+q)
+ * -
+ * k=1
+ *
+ * 1-x inf. B x(x+1)...(x+2j)
+ * (n+q) 1 - 2j
+ * + --------- - ------- + > --------------------
+ * x-1 x - x+2j+1
+ * 2(n+q) j=1 (2j)! (n+q)
+ *
+ * where the B2j are Bernoulli numbers. Note that (see zetac.c)
+ * zeta(x,1) = zetac(x) + 1.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error for single precision:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,25 10000 6.9e-7 1.0e-7
+ *
+ * Large arguments may produce underflow in powf(), in which
+ * case the results are inaccurate.
+ *
+ * REFERENCE:
+ *
+ * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals,
+ * Series, and Products, p. 1073; Academic Press, 1980.
+ *
+ */
+
+ int i;
+ Scalar p, r, a, b, k, s, t, w;
+
+ const Scalar A[] = {
+ Scalar(12.0),
+ Scalar(-720.0),
+ Scalar(30240.0),
+ Scalar(-1209600.0),
+ Scalar(47900160.0),
+ Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/
+ Scalar(7.47242496e10),
+ Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/
+ Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/
+ Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/
+ Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/
+ Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/
+ };
+
+ const Scalar maxnum = NumTraits<Scalar>::infinity();
+ const Scalar zero = 0.0, half = 0.5, one = 1.0;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if( x == one )
+ return maxnum;
+
+ if( x < one )
+ {
+ return nan;
+ }
+
+ if( q <= zero )
+ {
+ if(q == numext::floor(q))
+ {
+ return maxnum;
+ }
+ p = x;
+ r = numext::floor(p);
+ if (p != r)
+ return nan;
+ }
+
+ /* Permit negative q but continue sum until n+q > +9 .
+ * This case should be handled by a reflection formula.
+ * If q<0 and x is an integer, there is a relation to
+ * the polygamma function.
+ */
+ s = numext::pow( q, -x );
+ a = q;
+ b = zero;
+ // Run the summation in a helper function that is specific to the floating precision
+ if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) {
+ return s;
+ }
+
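+ // Euler-Maclaurin tail: the terms below add (n+q)^(1-x)/(x-1) - b/2 and the
+ // Bernoulli-number corrections stored in A[] (see the expansion above).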
+ w = a;
+ s += b*w/(x-one);
+ s -= half * b;
+ a = one;
+ k = zero;
+ for( i=0; i<12; i++ )
+ {
+ a *= x + k;
+ b /= w;
+ t = a*b/A[i];
+ s = s + t;
+ t = numext::abs(t/s);
+ if( t < machep ) {
+ break;
+ }
+ k += one;
+ a *= x + k;
+ b /= w;
+ k += one;
+ }
+ return s;
+ }
+};
+
+/****************************************************************************
+ * Implementation of polygamma function, requires C++11/C99 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct polygamma_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct polygamma_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct polygamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar n, Scalar x) {
+ Scalar zero = 0.0, one = 1.0;
+ Scalar nplus = n + one;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ // Check that n is an integer
+ if (numext::floor(n) != n) {
+ return nan;
+ }
+ // Just return the digamma function for n = 0
+ else if (n == zero) {
+ return digamma_impl<Scalar>::run(x);
+ }
+ // Use the relation polygamma(n, x) = (-1)^(n+1) * n! * zeta(n+1, x), as scipy does
+ else {
+ Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus));
+ return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x);
+ }
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+/************************************************************************************************
+ * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct betainc_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct betainc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct betainc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) {
+ /* betaincf.c
+ *
+ * Incomplete beta integral
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float a, b, x, y, betaincf();
+ *
+ * y = betaincf( a, b, x );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns incomplete beta integral of the arguments, evaluated
+ * from zero to x. The function is defined as
+ *
+ * x
+ * - -
+ * | (a+b) | | a-1 b-1
+ * ----------- | t (1-t) dt.
+ * - - | |
+ * | (a) | (b) -
+ * 0
+ *
+ * The domain of definition is 0 <= x <= 1. In this
+ * implementation a and b are restricted to positive values.
+ * The integral from x to 1 may be obtained by the symmetry
+ * relation
+ *
+ * 1 - betainc( a, b, x ) = betainc( b, a, 1-x ).
+ *
+ * The integral is evaluated by a continued fraction expansion.
+ * If a < 1, the function calls itself recursively after a
+ * transformation to increase a to a+1.
+ *
+ * ACCURACY (float):
+ *
+ * Tested at random points (a,b,x) with a and b in the indicated
+ * interval and x between 0 and 1.
+ *
+ * arithmetic domain # trials peak rms
+ * Relative error:
+ * IEEE 0,30 10000 3.7e-5 5.1e-6
+ * IEEE 0,100 10000 1.7e-4 2.5e-5
+ * The useful domain for relative error is limited by underflow
+ * of the single precision exponential function.
+ * Absolute error:
+ * IEEE 0,30 100000 2.2e-5 9.6e-7
+ * IEEE 0,100 10000 6.5e-5 3.7e-6
+ *
+ * Larger errors may occur for extreme ratios of a and b.
+ *
+ * ACCURACY (double):
+ * arithmetic domain # trials peak rms
+ * IEEE 0,5 10000 6.9e-15 4.5e-16
+ * IEEE 0,85 250000 2.2e-13 1.7e-14
+ * IEEE 0,1000 30000 5.3e-12 6.3e-13
+ * IEEE 0,10000 250000 9.3e-11 7.1e-12
+ * IEEE 0,100000 10000 8.7e-10 4.8e-11
+ * Outputs smaller than the IEEE gradual underflow threshold
+ * were excluded from these statistics.
+ *
+ * ERROR MESSAGES:
+ * message condition value returned
+ * incbet domain x<0, x>1 nan
+ * incbet underflow nan
+ */
+
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True)
+ * Continued fraction expansion #2 for incomplete beta integral (small_branch = False)
+ */
+template <typename Scalar>
+struct incbeta_cfe {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value ||
+ internal::is_same<Scalar, double>::value),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ const Scalar big = cephes_helper<Scalar>::big();
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar two = 2;
+
+ Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
+ Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update;
+ Scalar ans;
+ int n;
+
+ const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300;
+ const Scalar thresh =
+ (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep;
+ Scalar r = (internal::is_same<Scalar, float>::value) ? zero : one;
+
+ if (small_branch) {
+ k1 = a;
+ k2 = a + b;
+ k3 = a;
+ k4 = a + one;
+ k5 = one;
+ k6 = b - one;
+ k7 = k4;
+ k8 = a + two;
+ k26update = one;
+ } else {
+ k1 = a;
+ k2 = b - one;
+ k3 = a;
+ k4 = a + one;
+ k5 = one;
+ k6 = a + b;
+ k7 = a + one;
+ k8 = a + two;
+ k26update = -one;
+ x = x / (one - x);
+ }
+
+ pkm2 = zero;
+ qkm2 = one;
+ pkm1 = one;
+ qkm1 = one;
+ ans = one;
+ n = 0;
+
+ do {
+ xk = -(x * k1 * k2) / (k3 * k4);
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ xk = (x * k5 * k6) / (k7 * k8);
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ if (qk != zero) {
+ r = pk / qk;
+ if (numext::abs(ans - r) < numext::abs(r) * thresh) {
+ return r;
+ }
+ ans = r;
+ }
+
+ k1 += one;
+ k2 += k26update;
+ k3 += two;
+ k4 += two;
+ k5 += one;
+ k6 -= k26update;
+ k7 += two;
+ k8 += two;
+
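+ // Rescale the recurrences when the partial numerators/denominators get too
+ // large or too small, so that the ratio pk / qk stays representable.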
+ if ((numext::abs(qk) + numext::abs(pk)) > big) {
+ pkm2 *= biginv;
+ pkm1 *= biginv;
+ qkm2 *= biginv;
+ qkm1 *= biginv;
+ }
+ if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) {
+ pkm2 *= big;
+ pkm1 *= big;
+ qkm2 *= big;
+ qkm1 *= big;
+ }
+ } while (++n < num_iters);
+
+ return ans;
+ }
+};
+
+/* Helper functions depending on the Scalar type */
+template <typename Scalar>
+struct betainc_helper {};
+
+template <>
+struct betainc_helper<float> {
+ /* Core implementation, assumes a is large (> 1.0) */
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb,
+ float xx) {
+ float ans, a, b, t, x, onemx;
+ bool reversed_a_b = false;
+
+ onemx = 1.0f - xx;
+
+ /* see if x is greater than the mean */
+ if (xx > (aa / (aa + bb))) {
+ reversed_a_b = true;
+ a = bb;
+ b = aa;
+ t = xx;
+ x = onemx;
+ } else {
+ a = aa;
+ b = bb;
+ t = onemx;
+ x = xx;
+ }
+
+ /* Choose expansion for optimal convergence */
+ if (b > 10.0f) {
+ if (numext::abs(b * x / a) < 0.3f) {
+ t = betainc_helper<float>::incbps(a, b, x);
+ if (reversed_a_b) t = 1.0f - t;
+ return t;
+ }
+ }
+
+ ans = x * (a + b - 2.0f) / (a - 1.0f);
+ if (ans < 1.0f) {
+ ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */);
+ t = b * numext::log(t);
+ } else {
+ ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */);
+ t = (b - 1.0f) * numext::log(t);
+ }
+
+ t += a * numext::log(x) + lgamma_impl<float>::run(a + b) -
+ lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b);
+ t += numext::log(ans / a);
+ t = numext::exp(t);
+
+ if (reversed_a_b) t = 1.0f - t;
+ return t;
+ }
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) {
+ float t, u, y, s;
+ const float machep = cephes_helper<float>::machep();
+
+ y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a);
+ y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b);
+ y += lgamma_impl<float>::run(a + b);
+
+ t = x / (1.0f - x);
+ s = 0.0f;
+ u = 1.0f;
+ do {
+ b -= 1.0f;
+ if (b == 0.0f) {
+ break;
+ }
+ a += 1.0f;
+ u *= t * b / a;
+ s += u;
+ } while (numext::abs(u) > machep);
+
+ return numext::exp(y) * (1.0f + s);
+ }
+};
+
+template <>
+struct betainc_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static float run(float a, float b, float x) {
+ const float nan = NumTraits<float>::quiet_NaN();
+ float ans, t;
+
+ if (a <= 0.0f) return nan;
+ if (b <= 0.0f) return nan;
+ if ((x <= 0.0f) || (x >= 1.0f)) {
+ if (x == 0.0f) return 0.0f;
+ if (x == 1.0f) return 1.0f;
+ // mtherr("betaincf", DOMAIN);
+ return nan;
+ }
+
+ /* transformation for small aa */
+ if (a <= 1.0f) {
+ ans = betainc_helper<float>::incbsa(a + 1.0f, b, x);
+ t = a * numext::log(x) + b * numext::log1p(-x) +
+ lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a + 1.0f) -
+ lgamma_impl<float>::run(b);
+ return (ans + numext::exp(t));
+ } else {
+ return betainc_helper<float>::incbsa(a, b, x);
+ }
+ }
+};
+
+template <>
+struct betainc_helper<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) {
+ const double machep = cephes_helper<double>::machep();
+
+ double s, t, u, v, n, t1, z, ai;
+
+ ai = 1.0 / a;
+ u = (1.0 - b) * x;
+ v = u / (a + 1.0);
+ t1 = v;
+ t = u;
+ n = 2.0;
+ s = 0.0;
+ z = machep * ai;
+ while (numext::abs(v) > z) {
+ u = (n - b) * x / n;
+ t *= u;
+ v = t / (a + n);
+ s += v;
+ n += 1.0;
+ }
+ s += t1;
+ s += ai;
+
+ u = a * numext::log(x);
+ // TODO: gamma() is not directly implemented in Eigen.
+ /*
+ if ((a + b) < maxgam && numext::abs(u) < maxlog) {
+ t = gamma(a + b) / (gamma(a) * gamma(b));
+ s = s * t * pow(x, a);
+ } else {
+ */
+ t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+ lgamma_impl<double>::run(b) + u + numext::log(s);
+ return s = numext::exp(t);
+ }
+};
+
+template <>
+struct betainc_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static double run(double aa, double bb, double xx) {
+ const double nan = NumTraits<double>::quiet_NaN();
+ const double machep = cephes_helper<double>::machep();
+ // const double maxgam = 171.624376956302725;
+
+ double a, b, t, x, xc, w, y;
+ bool reversed_a_b = false;
+
+ if (aa <= 0.0 || bb <= 0.0) {
+ return nan; // goto domerr;
+ }
+
+ if ((xx <= 0.0) || (xx >= 1.0)) {
+ if (xx == 0.0) return (0.0);
+ if (xx == 1.0) return (1.0);
+ // mtherr("incbet", DOMAIN);
+ return nan;
+ }
+
+ if ((bb * xx) <= 1.0 && xx <= 0.95) {
+ return betainc_helper<double>::incbps(aa, bb, xx);
+ }
+
+ w = 1.0 - xx;
+
+ /* Reverse a and b if x is greater than the mean. */
+ if (xx > (aa / (aa + bb))) {
+ reversed_a_b = true;
+ a = bb;
+ b = aa;
+ xc = xx;
+ x = w;
+ } else {
+ a = aa;
+ b = bb;
+ xc = w;
+ x = xx;
+ }
+
+ if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) {
+ t = betainc_helper<double>::incbps(a, b, x);
+ if (t <= machep) {
+ t = 1.0 - machep;
+ } else {
+ t = 1.0 - t;
+ }
+ return t;
+ }
+
+ /* Choose expansion for better convergence. */
+ y = x * (a + b - 2.0) - (a - 1.0);
+ if (y < 0.0) {
+ w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */);
+ } else {
+ w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc;
+ }
+
+ /* Multiply w by the factor
+ a b _ _ _
+ x (1-x) | (a+b) / ( a | (a) | (b) ) . */
+
+ y = a * numext::log(x);
+ t = b * numext::log(xc);
+ // TODO: gamma is not directly implemented in Eigen.
+ /*
+ if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog)
+ {
+ t = pow(xc, b);
+ t *= pow(x, a);
+ t /= a;
+ t *= w;
+ t *= gamma(a + b) / (gamma(a) * gamma(b));
+ } else {
+ */
+ /* Resort to logarithms. */
+ y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+ lgamma_impl<double>::run(b);
+ y += numext::log(w / a);
+ t = numext::exp(y);
+
+ /* } */
+ // done:
+
+ if (reversed_a_b) {
+ if (t <= machep) {
+ t = 1.0 - machep;
+ } else {
+ t = 1.0 - t;
+ }
+ }
+ return t;
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+} // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar)
+ lgamma(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar)
+ digamma(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar)
+zeta(const Scalar& x, const Scalar& q) {
+ return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar)
+polygamma(const Scalar& n, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar)
+ erf(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
+ erfc(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
+ igamma(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
+ igammac(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
+ betainc(const Scalar& a, const Scalar& b, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x);
+}
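+
+// Example usage of the scalar entry points above; most of the float/double
+// specializations require C99 math support (EIGEN_HAS_C99_MATH):
+//
+//   double g = Eigen::numext::lgamma(3.5);             // log|Gamma(3.5)|
+//   double p = Eigen::numext::igamma(2.0, 1.5);        // normalized lower incomplete gamma
+//   double q = Eigen::numext::igammac(2.0, 1.5);       // its complement, so p + q == 1
+//   double b = Eigen::numext::betainc(2.0, 3.0, 0.25); // incomplete beta integral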
+
+} // end namespace numext
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
new file mode 100644
index 000000000..46d60d323
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
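+// Generic packet implementations: each op forwards to the corresponding
+// Eigen::numext scalar function, which is valid when Packet is a plain scalar
+// type; vectorized packet types provide their own specializations (see
+// arch/CUDA/CudaSpecialFunctions.h for the float4/double2 versions).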
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
+
+/** \internal \returns the zeta function of two arguments (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
+
+/** \internal \returns the polygamma function (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
+
+/** \internal \returns the incomplete beta function betainc(\a a, \a b, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
new file mode 100644
index 000000000..ec4fa8448
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
+#define EIGEN_CUDA_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet specializations and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
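+// Each specialization below applies the scalar routine lane by lane: float4
+// packets carry four single-precision lanes, double2 packets carry two
+// double-precision lanes.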
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+ return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+ using numext::lgamma;
+ return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+ using numext::digamma;
+ return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+ using numext::digamma;
+ return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+ using numext::zeta;
+ return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+ using numext::zeta;
+ return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+ using numext::polygamma;
+ return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+ using numext::polygamma;
+ return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+ return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+ using numext::erf;
+ return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+ using numext::erfc;
+ return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+ using numext::erfc;
+ return make_double2(erfc(a.x), erfc(a.y));
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+ using numext::igamma;
+ return make_float4(
+ igamma(a.x, x.x),
+ igamma(a.y, x.y),
+ igamma(a.z, x.z),
+ igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+ using numext::igamma;
+ return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+ using numext::igammac;
+ return make_float4(
+ igammac(a.x, x.x),
+ igammac(a.y, x.y),
+ igammac(a.z, x.z),
+ igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+ using numext::igammac;
+ return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+ using numext::betainc;
+ return make_float4(
+ betainc(a.x, b.x, x.x),
+ betainc(a.y, b.y, x.y),
+ betainc(a.z, b.z, x.z),
+ betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+ using numext::betainc;
+ return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/Splines/CMakeLists.txt b/unsupported/Eigen/src/Splines/CMakeLists.txt
deleted file mode 100644
index 55c6271e9..000000000
--- a/unsupported/Eigen/src/Splines/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Splines_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_Splines_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Splines COMPONENT Devel
- )
diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h
index d1636f466..627f6e482 100644
--- a/unsupported/Eigen/src/Splines/Spline.h
+++ b/unsupported/Eigen/src/Splines/Spline.h
@@ -94,7 +94,7 @@ namespace Eigen
const KnotVectorType& knots() const { return m_knots; }
/**
- * \brief Returns the knots of the underlying spline.
+ * \brief Returns the control points of the underlying spline.
**/
const ControlPointVectorType& ctrls() const { return m_ctrls; }
@@ -394,7 +394,7 @@ namespace Eigen
Matrix<Scalar,Order,Order> ndu(p+1,p+1);
- double saved, temp;
+ Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that?
ndu(0,0) = 1.0;
@@ -433,7 +433,7 @@ namespace Eigen
// Compute the k-th derivative
for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
{
- double d = 0.0;
+ Scalar d = 0.0;
DenseIndex rk,pk,j1,j2;
rk = r-k; pk = p-k;