Diffstat (limited to 'unsupported/Eigen/CXX11/src')
-rw-r--r--  unsupported/Eigen/CXX11/src/CMakeLists.txt  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt  6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/README.md  42
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/Tensor.h  20
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h  8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h  152
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h  12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h  7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h  12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h  41
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h  74
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h  670
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h  40
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h  48
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h  157
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h  120
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h  30
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h  108
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h  59
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h  82
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h  8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h  5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h  588
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h  33
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIO.h  69
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h  202
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h  50
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMap.h  6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h  51
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h  284
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h  112
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h  276
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h  104
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h  495
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRef.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h  6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h  287
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h  23
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt  8
-rw-r--r--  unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt  6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt  6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h  9
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h  206
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h  37
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h  7
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CMakeLists.txt  6
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h  4
-rw-r--r--  unsupported/Eigen/CXX11/src/util/MaxSizeVector.h  11
66 files changed, 3511 insertions, 1152 deletions
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
deleted file mode 100644
index 1734262bb..000000000
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(util)
-add_subdirectory(ThreadPool)
-add_subdirectory(Tensor)
-add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_Tensor_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index eeca2f69e..02146527b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1102,7 +1102,7 @@ Example: Reduction along two dimensions.
As a special case, if you pass no parameter to a reduction operation the
original tensor is reduced along *all* its dimensions. The result is a
-one-dimension tensor with a single value.
+scalar, represented as a zero-dimension tensor.
Eigen::Tensor<float, 3> a(2, 3, 4);
a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
@@ -1112,7 +1112,7 @@ one-dimension tensor with a single value.
{19.0f, 18.0f, 17.0f, 16.0f},
{20.0f, 21.0f, 22.0f, 23.0f}}});
// Reduce along all dimensions using the sum() operator.
- Eigen::Tensor<float, 1> b = a.sum();
+ Eigen::Tensor<float, 0> b = a.sum();
cout << "b" << endl << b << endl << endl;
=>
b
@@ -1168,6 +1168,44 @@ Reduce a tensor using a user-defined reduction operator. See ```SumReducer```
in TensorFunctors.h for information on how to implement a reduction operator.
+## Scan Operations
+
+A *Scan* operation returns a tensor with the same dimensions as the original
+tensor. The operation performs an inclusive scan along the specified
+axis, which means it computes a running total along the axis for a given
+reduction operation.
+If the reduction operation corresponds to summation, then this computes the
+prefix sum of the tensor along the given axis.
+
+Example:
+
+ // Create a tensor of 2 dimensions
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{1, 2, 3}, {4, 5, 6}});
+ // Scan it along the second dimension (1) using summation
+ Eigen::Tensor<int, 2> b = a.cumsum(1);
+ // The result is a tensor with the same size as the input
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 2 3
+ 4 5 6
+
+ b
+ 1 3 6
+ 4 9 15
+
+### <Operation> cumsum(const Index& axis)
+
+Perform a scan by summing consecutive entries.
+
+### <Operation> cumprod(const Index& axis)
+
+Perform a scan by multiplying consecutive entries.
+
+
## Convolutions
### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
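
To illustrate the scan operations documented above, here is a minimal sketch that exercises both the cumsum/cumprod shorthands and the generic scan() entry point introduced by this change. Using internal::ProdReducer directly mirrors how cumprod is defined in TensorBase.h below; it is shown for illustration only, not as public API.

    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<int, 2> a(2, 3);
      a.setValues({{1, 2, 3}, {4, 5, 6}});

      // Inclusive running product along axis 1, via the cumprod shorthand.
      Eigen::Tensor<int, 2> p = a.cumprod(1);

      // The same scan expressed through the generic entry point.
      Eigen::Tensor<int, 2> q =
          a.scan(1, Eigen::internal::ProdReducer<int>(), /*exclusive=*/false);

      std::cout << p << "\n\n" << q << "\n";  // both print: 1 2 6 / 4 20 120
      return 0;
    }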
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 759dede3f..1940a9692 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -110,7 +110,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
@@ -150,7 +150,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
return m_storage.data()[index];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
@@ -190,7 +190,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
return m_storage.data()[index];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
@@ -257,7 +257,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
return coeff(index);
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
@@ -336,7 +336,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
: m_storage(firstDimension, otherDimensions...)
@@ -350,22 +350,22 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
: m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
{
EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
: m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
{
EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
: m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
{
EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
: m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
{
EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -418,7 +418,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
return *this;
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
void resize(Index firstDimension, IndexTypes... otherDimensions)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index babafe108..d06f40cd8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -254,6 +254,14 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost = 1.0 +
+ (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+ return m_orig_impl.costPerCoeff(vectorized) +
+ m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+ }
+
private:
EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
if (m_return_dim < 0) {
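
For readers unfamiliar with the cost model, the following fragment (a sketch, not part of the patch) reproduces the compute term of the new costPerCoeff above: one cycle for the tuple reduction itself, plus an integer div and mod when an index along m_return_dim has to be recovered from the flat argmax index.

    #include <unsupported/Eigen/CXX11/Tensor>

    // Illustrative only: mirrors the arithmetic in the new costPerCoeff.
    double tuple_reducer_compute_cost(int return_dim) {
      double c = 1.0;
      if (return_dim >= 0)
        c += Eigen::TensorOpCost::ModCost<Eigen::Index>() +
             Eigen::TensorOpCost::DivCost<Eigen::Index>();
      return c;  // wrapped as TensorOpCost(0, 0, c) in the evaluator
    }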
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 1a34f3ccc..7a45a5cf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+ log1p() const {
+ return unaryExpr(internal::scalar_log1p_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
abs() const {
return unaryExpr(internal::scalar_abs_op<Scalar>());
@@ -204,34 +210,74 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
pow(Scalar exponent) const {
- return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+ real() const {
+ return unaryExpr(internal::scalar_real_op<Scalar>());
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+ imag() const {
+ return unaryExpr(internal::scalar_imag_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
operator+ (Scalar rhs) const {
- return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+ operator+ (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
operator- (Scalar rhs) const {
EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+ operator- (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
operator* (Scalar rhs) const {
- return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+ operator* (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
operator/ (Scalar rhs) const {
- return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
+ return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+ operator/ (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
}
EIGEN_DEVICE_FUNC
@@ -277,7 +323,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
return unaryExpr(internal::scalar_floor_op<Scalar>());
}
-
// Generic binary operation support.
template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
@@ -342,66 +387,66 @@ class TensorBase<Derived, ReadOnlyAccessors>
// Comparisons and tests.
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
operator<(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
operator<=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
operator>(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
operator>=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
operator==(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
operator!=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>());
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
}
// comparisons and tests for Scalars
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator<(Scalar threshold) const {
return operator<(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator<=(Scalar threshold) const {
return operator<=(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator>(Scalar threshold) const {
return operator>(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator>=(Scalar threshold) const {
return operator>=(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator==(Scalar threshold) const {
return operator==(constant(threshold));
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
operator!=(Scalar threshold) const {
return operator!=(constant(threshold));
}
@@ -453,6 +498,28 @@ class TensorBase<Derived, ReadOnlyAccessors>
return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft);
}
+ // Scan.
+ typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanSumOp
+ cumsum(const Index& axis, bool exclusive = false) const {
+ return TensorScanSumOp(derived(), axis, exclusive);
+ }
+
+ typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanProdOp
+ cumprod(const Index& axis, bool exclusive = false) const {
+ return TensorScanProdOp(derived(), axis, exclusive);
+ }
+
+ template <typename Reducer>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanOp<Reducer, const Derived>
+ scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+ return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
+ }
+
// Reductions.
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
@@ -676,6 +743,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
slice(const StartIndices& startIndices, const Sizes& sizes) const {
return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
}
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ const Derived>(derived(), startIndices, stopIndices, strides);
+ }
template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorChippingOp<DimId, const Derived>
chip(const Index offset) const {
@@ -750,8 +823,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
};
-template<typename Derived>
-class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
public:
typedef internal::traits<Derived> DerivedTraits;
typedef typename DerivedTraits::Scalar Scalar;
@@ -761,7 +834,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
- template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+ template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setZero() {
@@ -780,7 +853,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
return derived() = this->template random<RandomGenerator>();
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setValues(
const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
@@ -851,6 +924,19 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
}
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ const Derived>(derived(), startIndices, stopIndices, strides);
+ }
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ Derived>(derived(), startIndices, stopIndices, strides);
+ }
+
template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorChippingOp<DimId, const Derived>
chip(const Index offset) const {
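
A short sketch of what the rewritten overloads above enable: with the bind1st_op forms, a scalar may now appear on the left-hand side of +, -, * and /, and the new log1p() sits alongside the other coefficient-wise unaries. Illustrative usage only, not part of the patch:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> t(3);
      t.setValues({1.f, 2.f, 4.f});

      // Scalar on the left-hand side, routed through bind1st_op:
      Eigen::Tensor<float, 1> r = 1.0f / t;   // 1, 0.5, 0.25
      Eigen::Tensor<float, 1> s = 10.0f - t;  // 9, 8, 6

      // New coefficient-wise unary:
      Eigen::Tensor<float, 1> l = t.log1p();  // log(1 + x) per coefficient
      return 0;
    }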
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index c771496e2..5d67f69f3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = false,
+ IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@@ -118,7 +118,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
// and store the result in a scalar. Instead, one should first reshape the scalar into an N-D
// tensor (N >= 1) with a single element, and then broadcast.
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
const InputDimensions& input_dims = m_impl.dimensions();
const Broadcast& broadcast = op.broadcast();
for (int i = 0; i < NumDims; ++i) {
@@ -247,7 +247,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
@@ -299,7 +299,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
@@ -354,11 +354,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
if (NumDims > 0) {
for (int i = NumDims - 1; i > 0; --i) {
compute_cost += TensorOpCost::DivCost<Index>();
- if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+ if (internal::index_statically_eq<Broadcast>(i, 1)) {
compute_cost +=
TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
} else {
- if (!internal::index_statically_eq<InputDimensions>()(i, 1)) {
+ if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
compute_cost += TensorOpCost::MulCost<Index>() +
TensorOpCost::ModCost<Index>() +
TensorOpCost::AddCost<Index>();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 2742dbb95..1ba7ef170 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -152,8 +152,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
- // We could also support the case where NumInputDims==1 if needed.
- EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -203,7 +202,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
@@ -342,7 +341,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
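
The relaxed assertion above means a rank-1 tensor can now be chipped, yielding a rank-0 (scalar) tensor. A minimal sketch of the newly allowed case:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> v(4);
      v.setValues({10.f, 11.f, 12.f, 13.f});

      // NumInputDims == 1 is now accepted; the chip of a vector is a scalar.
      Eigen::Tensor<float, 0> c = v.chip(2, 0);  // offset 2 along dim 0
      // c() == 12.f
      return 0;
    }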
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 839c6e3e5..59bf90d93 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -128,8 +128,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(0 <= m_axis && m_axis < NumDims);
const Dimensions& lhs_dims = m_leftImpl.dimensions();
@@ -248,8 +248,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
@@ -344,8 +344,8 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 6f113b903..20b29e5fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
- typename RhsXprType::Scalar>::ret Scalar;
+ typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+ typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -37,7 +38,7 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
typedef typename remove_reference<RhsNested>::type _RhsNested;
// From NumDims below.
- static const int NumDimensions = max_n_1<traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size;
+ static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
static const int Layout = traits<LhsXprType>::Layout;
enum {
@@ -65,7 +66,7 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
typedef Device_ Device;
// From NumDims below.
- static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size;
+ static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
};
} // end namespace internal
@@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
{
public:
typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
- typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
- typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
@@ -140,11 +141,11 @@ struct TensorContractionEvaluatorBase
static const int RDims =
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
typedef DSizes<Index, NumDims> Dimensions;
@@ -218,11 +219,9 @@ struct TensorContractionEvaluatorBase
rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
}
- m_i_strides[0] = 1;
- m_j_strides[0] = 1;
- if(ContractDims) {
- m_k_strides[0] = 1;
- }
+ if (m_i_strides.size() > 0) m_i_strides[0] = 1;
+ if (m_j_strides.size() > 0) m_j_strides[0] = 1;
+ if (m_k_strides.size() > 0) m_k_strides[0] = 1;
m_i_size = 1;
m_j_size = 1;
@@ -318,11 +317,6 @@ struct TensorContractionEvaluatorBase
}
}
- // Scalar case. We represent the result as a 1d tensor of size 1.
- if (LDims + RDims == 2 * ContractDims) {
- m_dimensions[0] = 1;
- }
-
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -510,7 +504,7 @@ struct TensorContractionEvaluatorBase
// call gebp (matrix kernel)
// The parameters here are copied from Eigen's GEMM implementation
- gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
+ gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
}
}
}
@@ -607,15 +601,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
static const int ContractDims = internal::array_size<Indices>::value;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
// Could we use NumDimensions here?
typedef DSizes<Index, NumDims> Dimensions;
-
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) { }
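
With the rank-0 special case removed above, contracting away every dimension now yields a genuine rank-0 tensor rather than a 1-d tensor of size 1. A minimal sketch (illustrative, not part of the patch):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> u(3), v(3);
      u.setValues({1.f, 2.f, 3.f});
      v.setValues({4.f, 5.f, 6.f});

      // Contract the single dimension of each operand: a dot product.
      Eigen::array<Eigen::IndexPair<int>, 1> dims;
      dims[0] = Eigen::IndexPair<int>(0, 0);
      Eigen::Tensor<float, 0> dot = u.contract(v, dims);
      // dot() == 32.f
      return 0;
    }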
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 6a3ef14ef..d65dbb40f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -461,8 +461,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
#undef writeResultShmem
#undef writeRow
- const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
- const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+ const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+ const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
if (threadIdx.x < max_i_write) {
if (max_j_write == 8) {
@@ -1240,10 +1240,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef DSizes<Index, NumDims> Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index b27e1a1b4..9b2cb3ff6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -130,19 +130,19 @@ class SimpleTensorContractionMapper {
}
Index contract_val = left ? col : row;
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx = contract_val / m_k_strides[i];
- linidx += idx * m_contract_strides[i];
- contract_val -= idx * m_k_strides[i];
- }
-
if(array_size<contract_t>::value > 0) {
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx += contract_val;
- } else {
- linidx += contract_val * m_contract_strides[0];
- }
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx = contract_val / m_k_strides[i];
+ linidx += idx * m_contract_strides[i];
+ contract_val -= idx * m_k_strides[i];
+ }
+
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx += contract_val;
+ } else {
+ linidx += contract_val * m_contract_strides[0];
+ }
}
return linidx;
@@ -153,15 +153,15 @@ class SimpleTensorContractionMapper {
const bool left = (side == Lhs);
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
- for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = nocontract_val[0] / m_ij_strides[i];
- const Index idx1 = nocontract_val[1] / m_ij_strides[i];
- linidx[0] += idx0 * m_nocontract_strides[i];
- linidx[1] += idx1 * m_nocontract_strides[i];
- nocontract_val[0] -= idx0 * m_ij_strides[i];
- nocontract_val[1] -= idx1 * m_ij_strides[i];
- }
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+ const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+ linidx[0] += idx0 * m_nocontract_strides[i];
+ linidx[1] += idx1 * m_nocontract_strides[i];
+ nocontract_val[0] -= idx0 * m_ij_strides[i];
+ nocontract_val[1] -= idx1 * m_ij_strides[i];
+ }
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx[0] += nocontract_val[0];
@@ -173,22 +173,24 @@ class SimpleTensorContractionMapper {
}
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = contract_val[0] / m_k_strides[i];
- const Index idx1 = contract_val[1] / m_k_strides[i];
- linidx[0] += idx0 * m_contract_strides[i];
- linidx[1] += idx1 * m_contract_strides[i];
- contract_val[0] -= idx0 * m_k_strides[i];
- contract_val[1] -= idx1 * m_k_strides[i];
- }
+ if (array_size<contract_t>::value > 0) {
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = contract_val[0] / m_k_strides[i];
+ const Index idx1 = contract_val[1] / m_k_strides[i];
+ linidx[0] += idx0 * m_contract_strides[i];
+ linidx[1] += idx1 * m_contract_strides[i];
+ contract_val[0] -= idx0 * m_k_strides[i];
+ contract_val[1] -= idx1 * m_k_strides[i];
+ }
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx[0] += contract_val[0];
- linidx[1] += contract_val[1];
- } else {
- linidx[0] += contract_val[0] * m_contract_strides[0];
- linidx[1] += contract_val[1] * m_contract_strides[0];
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx[0] += contract_val[0];
+ linidx[1] += contract_val[1];
+ } else {
+ linidx[0] += contract_val[0] * m_contract_strides[0];
+ linidx[1] += contract_val[1] * m_contract_strides[0];
+ }
}
return IndexPair<Index>(linidx[0], linidx[1]);
}
@@ -200,7 +202,7 @@ class SimpleTensorContractionMapper {
return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
- return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
+ return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
}
protected:
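
The guards added above all protect the degenerate case array_size<contract_t>::value == 0, i.e. a contraction over an empty index set, which amounts to an outer (tensor) product. A sketch of that case, assuming the empty-index-set form is what these guards are meant to serve:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> u(3), v(2);
      u.setValues({1.f, 2.f, 3.f});
      v.setValues({10.f, 20.f});

      // Zero contraction dimensions: the mappers' contract_t is empty and
      // the result is the 3x2 outer product of u and v.
      Eigen::array<Eigen::IndexPair<int>, 0> no_dims;
      Eigen::Tensor<float, 2> outer = u.contract(v, no_dims);
      return 0;
    }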
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 9044454fd..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -14,6 +14,8 @@
#ifdef EIGEN_USE_THREADS
namespace Eigen {
+
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
namespace internal {
template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -52,7 +54,7 @@ struct packRhsAndKernelArg {
};
} // end namespace internal
-
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -92,10 +94,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef DSizes<Index, NumDims> Dimensions;
@@ -110,6 +112,623 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) {}
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+ bool rhs_inner_dim_reordered, int Alignment>
+ void evalProduct(Scalar* buffer) const {
+ typedef
+ typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
+ LhsScalar;
+ typedef
+ typename internal::remove_const<typename EvalRightArgType::Scalar>::type
+ RhsScalar;
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+ typedef internal::TensorContractionInputMapper<
+ LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+ contract_t, internal::packet_traits<LhsScalar>::size,
+ lhs_inner_dim_contiguous, false, Unaligned>
+ LhsMapper;
+ typedef internal::TensorContractionInputMapper<
+ RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+ contract_t, internal::packet_traits<RhsScalar>::size,
+ rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+ RhsMapper;
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+ typedef internal::gemm_pack_lhs<LhsScalar, Index,
+ typename LhsMapper::SubMapper, Traits::mr,
+ Traits::LhsProgress, ColMajor>
+ LhsPacker;
+ typedef internal::gemm_pack_rhs<
+ RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
+ RhsPacker;
+ typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+ Traits::mr, Traits::nr, false, false>
+ GebpKernel;
+
+ const Index m = this->m_i_size;
+ const Index n = this->m_j_size;
+ const Index k = this->m_k_size;
+ if (m == 0 || n == 0 || k == 0) return;
+
+ // Compute a set of algorithm parameters:
+ // - kernel block sizes (bm, bn, bk)
+ // - task grain sizes (number of kernels executed per task: gm, gn)
+ // - number of threads
+ // - sharding by row/column
+ // - parallel packing or first lhs then rhs
+ // and some derived parameters:
+ // - number of tasks (nm, nn, nk)
+ // - number of kernels (nm0, nn0)
+ // Unfortunately, all these parameters are tightly interdependent.
+ // So in some cases we first compute approximate values, then compute other
+ // values based on these approximations and then refine the approximations.
+
+ // There are lots of heuristics here. There is some reasoning behind them,
+ // but ultimately they are just tuned on contraction benchmarks for
+ // different input configurations, thread counts and instruction sets.
+ // So feel free to question any of them.
+
+ // Compute whether we want to shard by row or by column.
+ // This is a first approximation, it will be refined later. Since we don't
+ // know the number of threads yet, we use 2, because what we are most
+ // interested in at this point is whether it makes sense to use
+ // parallelization at all or not.
+ bool shard_by_col = shardByCol(m, n, 2);
+
+ // First approximation of kernel blocking sizes.
+ // Again, we don't know the number of threads yet, so we use 2.
+ Index bm, bn, bk;
+ if (shard_by_col) {
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::ShardByCol>
+ blocking(k, m, n, 2);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ } else {
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::ShardByRow>
+ blocking(k, m, n, 2);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ }
+
+ // Compute optimal number of threads.
+ // Note: we use bk instead of k here because we are interested in amount of
+ // _parallelizable_ computations, and computations are not parallelizable
+ // across k dimension.
+ const TensorOpCost cost =
+ contractionCost(m, n, bm, bn, bk, shard_by_col, false);
+ int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ static_cast<double>(n) * m, cost, this->m_device.numThreads());
+
+ // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
+ // model is not tuned. Remove this when the cost model is tuned.
+ if (n == 1) num_threads = 1;
+
+ if (num_threads == 1) {
+ // The single-threaded algorithm should be faster in this case.
+ if (n == 1)
+ this->template evalGemv<lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Alignment>(buffer);
+ else
+ this->template evalGemm<lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Alignment>(buffer);
+ return;
+ }
+
+ // Now that we know number of threads, recalculate sharding and blocking.
+ shard_by_col = shardByCol(m, n, num_threads);
+ if (shard_by_col) {
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::ShardByCol>
+ blocking(k, m, n, num_threads);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ } else {
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+ internal::ShardByRow>
+ blocking(k, m, n, num_threads);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ }
+
+ // Number of kernels for each dimension.
+ Index nm0 = divup(m, bm);
+ Index nn0 = divup(n, bn);
+ Index nk = divup(k, bk);
+
+ // Calculate task grain size (number of kernels executed per task).
+ // This task size coarsening serves two purposes:
+ // 1. It reduces per-task overheads including synchronization overheads.
+ // 2. It allows better use of the caches (reuse of the same packed rhs in several
+ // consecutive kernels).
+ Index gm = 1;
+ Index gn = 1;
+ // If we are sharding by column, then we prefer to reduce rows first.
+ if (shard_by_col) {
+ gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+ gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+ } else {
+ gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+ gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+ }
+ // Number of tasks in each dimension.
+ Index nm = divup(nm0, gm);
+ Index nn = divup(nn0, gn);
+
+ // Last but not least, decide whether we want to issue both lhs and rhs
+ // packing in parallel; or issue lhs packing first, and then issue rhs
+ // packing when lhs packing completes (for !shard_by_col lhs and rhs are
+ // swapped). Parallel packing allows more parallelism (for both packing and
+ // kernels), while sequential packing provides better locality (once
+ // a thread finishes rhs packing it proceeds to kernels with that rhs).
+ // First, we are interested in parallel packing if there are few tasks.
+ bool parallel_pack = num_threads >= nm * nn;
+ // Also do parallel packing if all data fits into L2$.
+ if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
+ l2CacheSize() * num_threads)
+ parallel_pack = true;
+ // But don't do it if we will use each rhs only once. Locality seems to be
+ // more important in this case.
+ if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
+ this->m_i_strides, this->m_left_contracting_strides,
+ this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
+ this->m_j_strides, this->m_right_contracting_strides,
+ this->m_k_strides);
+
+ Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
+ OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
+ k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
+ shard_by_col, parallel_pack)
+ .run();
+ }
+
+ // Context coordinates a single parallel gemm operation.
+ template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
+ typename LhsMapper, typename RhsMapper, typename OutputMapper>
+ class Context {
+ public:
+ Context(const Device& device, int num_threads, LhsMapper& lhs,
+ RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
+ Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
+ Index gn, Index nm0, Index nn0, bool shard_by_col,
+ bool parallel_pack)
+ : device_(device),
+ lhs_(lhs),
+ rhs_(rhs),
+ buffer_(buffer),
+ output_(buffer, tm),
+ num_threads_(num_threads),
+ shard_by_col_(shard_by_col),
+ parallel_pack_(parallel_pack),
+ m_(tm),
+ n_(tn),
+ k_(tk),
+ bm_(bm),
+ bn_(bn),
+ bk_(bk),
+ nm_(nm),
+ nn_(nn),
+ nk_(nk),
+ gm_(gm),
+ gn_(gn),
+ nm0_(nm0),
+ nn0_(nn0)
+ {
+ for (Index x = 0; x < P; x++) {
+ // Normal number of notifications for k slice switch is
+ // nm_ + nn_ + nm_ * nn_. However, the first P - 1 slices will receive only
+ // nm_ + nn_ notifications, because they will not receive notifications
+ // from preceding kernels.
+ state_switch_[x] =
+ x == 0
+ ? 1
+ : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
+ (x == P - 1 ? nm_ * nn_ : 0);
+ state_packing_ready_[x] =
+ parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
+ state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
+ for (Index m = 0; m < nm_; m++) {
+ state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
+ // Kernels generally receive 3 notifications (previous kernel + 2
+ // packing), but the first slice won't get notifications from previous
+ // kernels.
+ for (Index n = 0; n < nn_; n++)
+ state_kernel_[x][m][n].store(
+ (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
+ std::memory_order_relaxed);
+ }
+ }
+
+ // Allocate memory for packed rhs/lhs matrices.
+ size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+ size_t lhs_size =
+ divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
+ size_t rhs_size =
+ divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
+ packed_mem_ = static_cast<char*>(internal::aligned_malloc(
+ (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
+ char* mem = static_cast<char*>(packed_mem_);
+ for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
+ packed_lhs_[x].resize(nm0_);
+ for (Index m = 0; m < nm0_; m++) {
+ packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
+ mem += lhs_size;
+ }
+ packed_rhs_[x].resize(nn0_);
+ for (Index n = 0; n < nn0_; n++) {
+ packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
+ mem += rhs_size;
+ }
+ }
+ }
+
+ ~Context() {
+ for (Index x = 0; x < P; x++) {
+ for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
+ delete[] state_kernel_[x];
+ }
+ internal::aligned_free(packed_mem_);
+ }
+
+ void run() {
+ // Kick off packing of the first slice.
+ signal_switch(0, 1);
+ // Wait for overall completion.
+ // TODO(dvyukov): this wait can lead to deadlock.
+ // If nthreads contractions are concurrently submitted from worker
+ // threads, this wait will block all worker threads and the system will
+ // deadlock.
+ done_.Wait();
+ }
+
+ private:
+ Notification done_;
+ const Device& device_;
+ LhsMapper& lhs_;
+ RhsMapper& rhs_;
+ Scalar* const buffer_;
+ OutputMapper output_;
+ const int num_threads_;
+ const bool shard_by_col_;
+ const bool parallel_pack_;
+ // Matrix sizes.
+ const Index m_;
+ const Index n_;
+ const Index k_;
+ // Block sizes.
+ const Index bm_;
+ const Index bn_;
+ const Index bk_;
+ // Number of tasks.
+ const Index nm_;
+ const Index nn_;
+ const Index nk_;
+ // Task grain sizes (number of kernels executed per task).
+ const Index gm_;
+ const Index gn_;
+ // Number of blocks (this is different from nm_/nn_ because of task size
+ // coarsening).
+ const Index nm0_;
+ const Index nn0_;
+
+ // Parallelization strategy.
+ //
+ // Blocks related to the same k block can run in parallel because they write
+ // to different output blocks. So we parallelize within k slices; this
+ // gives us a parallelism level of m x n. Before we can start any kernels
+ // related to the k-th slice, we need to issue m lhs packing tasks and n rhs
+ // packing tasks.
+ //
+ // However, there is a bottleneck when we are finishing kernels for k-th
+ // slice (at the very end there is only 1 runnable kernel). To mitigate this
+ // bottleneck we allow kernels from k-th and k+1-th slices to run in
+ // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
+ // output block, so they must not run in parallel.
+ //
+ // This gives us the following dependency graph.
+ // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
+ // packing tasks.
+ // Kernel (m, n, k) can start when:
+ // - kernel (m, n, k-1) has finished
+ // - lhs packing (m, k) has finished
+ // - rhs packing (n, k) has finished
+ // Lhs/rhs packing can start when:
+ // - all packing for slice k-1 has finished (artificially imposed to limit
+ // the amount of parallel packing)
+ //
+ // On top of that we limit runnable tasks to two consecutive k slices.
+ // This is done to limit the amount of memory we need for packed lhs/rhs
+ // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
+ //
+ // state_switch_ tracks when we are ready to switch to the next k slice.
+ // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
+ // These variables roll over 3 consecutive k slices: the first two we are
+ // actively executing, plus one to track completion of kernels in the second
+ // slice.
+ static const Index P = 3;
+ void* packed_mem_;
+ std::vector<LhsScalar*> packed_lhs_[P - 1];
+ std::vector<RhsScalar*> packed_rhs_[P - 1];
+ std::atomic<uint8_t>** state_kernel_[P];
+ // state_switch_ is frequently modified by worker threads, while other
+ // fields are read-only after constructor. Let's move it to a separate cache
+ // line to reduce cache-coherency traffic.
+ char pad_[128];
+ std::atomic<Index> state_packing_ready_[P];
+ std::atomic<Index> state_switch_[P];
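A small standalone illustration (assuming P = 3 as declared above) of the rolling indexing this implies: state slots are reused modulo P, packed buffers modulo P - 1:

#include <cstdio>

int main() {
  const int P = 3;  // matches the declaration above
  for (int k = 0; k < 6; ++k)
    std::printf("slice %d: state slot %d, packed buffer slot %d\n",
                k, k % P, k % (P - 1));
  // Slices k and k+1 can be in flight while slot (k+2) % P counts kernel
  // completions, hence three state slots but only two buffer slots.
  return 0;
}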
+
+ void pack_lhs(Index m, Index k) {
+ const Index mend = m * gm_ + gm(m);
+ for (Index m1 = m * gm_; m1 < mend; m1++)
+ LhsPacker()(packed_lhs_[k % (P - 1)][m1],
+ lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+
+ if (!parallel_pack_ && shard_by_col_) {
+ signal_packing(k);
+ } else {
+ signal_switch(k + 1);
+ for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
+ }
+ }
+
+ void pack_rhs(Index n, Index k) {
+ const Index nend = n * gn_ + gn(n);
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ if (k == 0) {
+ // Zero the output memory in parallel.
+ // On a 10000x2x10000 matmul, zeroing can easily take half of the time.
+ // Zero the (bn x m) row. Safe to do here because all kernels that will
+ // write to this memory depend on completion of this task.
+ // Note: don't call device_.memset() here. device_.memset() blocks on
+ // thread pool worker thread, which can lead to underutilization and
+ // deadlocks.
+ memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
+ }
+ RhsPacker()(packed_rhs_[k % (P - 1)][n1],
+ rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+ }
+
+ if (parallel_pack_ || shard_by_col_) {
+ signal_switch(k + 1);
+ for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
+ } else {
+ signal_packing(k);
+ }
+ }
+
+ void kernel(Index m, Index n, Index k) {
+ // Note: the order of iteration matters here. Iteration over m is innermost
+ // because we want to reuse the same packed rhs in consecutive tasks
+ // (rhs fits into the L2$ while lhs only fits into the L3$).
+ const Index nend = n * gn_ + gn(n);
+ const Index mend = m * gm_ + gm(m);
+ if (shard_by_col_) {
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ for (Index m1 = m * gm_; m1 < mend; m1++)
+ GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+ packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+ Scalar(1), -1, -1, 0, 0);
+ }
+ } else {
+ for (Index m1 = m * gm_; m1 < mend; m1++)
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+ packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+ Scalar(1), -1, -1, 0, 0);
+ }
+ }
+ signal_kernel(m, n, k + 1, false);
+ signal_switch(k + 2);
+ }
+
+ void signal_packing(Index k) {
+ eigen_assert(!parallel_pack_);
+ Index s = state_packing_ready_[k % P].fetch_sub(1);
+ eigen_assert(s > 0);
+ if (s != 1) return;
+ state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
+ enqueue_packing(k, shard_by_col_);
+ }
+
+ void signal_kernel(Index m, Index n, Index k, bool sync) {
+ std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
+ Index s = state->load();
+ eigen_assert(s > 0);
+ if (s != 1 && state->fetch_sub(1) != 1) return;
+ state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
+ if (sync)
+ kernel(m, n, k);
+ else
+ device_.enqueueNoNotification([=]() { kernel(m, n, k); });
+ }
+
+ void signal_switch(Index k, Index v = 1) {
+ Index s = state_switch_[k % P].fetch_sub(v);
+ eigen_assert(s >= v);
+ if (s != v) return;
+
+ // Ready to switch to the next k slice.
+ // Reset counter for the next iteration.
+ state_switch_[k % P] =
+ (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
+ nm_ * nn_;
+ if (k < nk_) {
+ // Issue lhs/rhs packing. Their completion will in turn kick off
+ // kernels.
+ if (parallel_pack_) {
+ enqueue_packing(k, !shard_by_col_);
+ enqueue_packing(k, shard_by_col_);
+ } else if (shard_by_col_) {
+ enqueue_packing(k, false);
+ } else {
+ enqueue_packing(k, true);
+ }
+
+ // Termination handling.
+ // Because kernel completion signals the k + 2 switch, we need to finish
+ // nk + 2 slices without issuing any tasks on slice nk + 1. So here we
+ // pretend that all slice nk + 1 packing tasks just finished instantly, so
+ // that the nk + 2 switch only waits for completion of the nk kernels.
+ } else if (k == nk_) {
+ signal_switch(k + 1,
+ parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
+ } else {
+ done_.Notify();
+ }
+ }
+
+ // Enqueue all rhs/lhs packing for k-th slice.
+ void enqueue_packing(Index k, bool rhs) {
+ enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
+ }
+
+ void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
+ if (end - start == 1) {
+ if (rhs)
+ pack_rhs(start, k);
+ else
+ pack_lhs(start, k);
+ } else {
+ Index mid = (start + end) / 2;
+ device_.enqueueNoNotification(
+ [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+ device_.enqueueNoNotification(
+ [=]() { enqueue_packing_helper(start, mid, k, rhs); });
+ }
+ }
+
+ // Block sizes, accounting for a potentially incomplete last block.
+ Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
+ Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
+ Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
+ // Task grain sizes, accounting for a potentially incomplete last task.
+ Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
+ Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
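A quick standalone check of the last-block formula, using hypothetical sizes:

#include <cassert>

int main() {
  const long m_ = 1000, bm_ = 128;
  const long nm0_ = (m_ + bm_ - 1) / bm_;   // 8 blocks of up to 128 rows
  // Last block: 1000 + 128 - 128 * 8 = 104 leftover rows.
  assert(m_ + bm_ - bm_ * nm0_ == 104);
  // When the size divides evenly, the last block is a full bm_.
  const long m2 = 1024, n2 = (m2 + bm_ - 1) / bm_;
  assert(m2 + bm_ - bm_ * n2 == bm_);
  return 0;
}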
+
+ Context(const Context&) = delete;
+ void operator=(const Context&) = delete;
+ };
+
+ // Decide whether we want to shard m x n contraction by columns or by rows.
+ static bool shardByCol(Index m, Index n, Index num_threads) {
+ // Note: we are comparing both n and m against Traits::nr; it is not
+ // a mistake. We are trying to figure out how both n and m will fit into
+ // the main sharding dimension.
+
+ // Sharding by column is the default
+ // ... unless there is enough data for vectorization over rows
+ if (m / num_threads >= Traits::nr &&
+ // and not enough data for vectorization over columns
+ (n / num_threads < Traits::nr ||
+ // ... or barely enough data for vectorization over columns,
+ // but it is not evenly divisible across threads
+ (n / num_threads < 4 * Traits::nr &&
+ (n % (num_threads * Traits::nr)) != 0 &&
+ // ... and it is evenly divisible across threads for rows
+ ((m % (num_threads * Traits::nr)) == 0 ||
+ // ... or it is not evenly divisible in either dimension, but
+ // there is much more data over rows so that corner effects are
+ // mitigated.
+ (m / n >= 6)))))
+ return false;
+ // Also prefer sharding by row if the matrix is substantially elongated
+ // along the row dimension.
+ if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
+ return true;
+ }
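For readability, the same heuristic restated as a standalone function, with nr passed explicitly in place of Traits::nr (an assumption for illustration only):

// Returns true for sharding by column, false for sharding by row.
bool shardByColSketch(long m, long n, long num_threads, long nr) {
  if (m / num_threads >= nr &&                 // enough rows per thread
      (n / num_threads < nr ||                 // too few columns
       (n / num_threads < 4 * nr &&            // barely enough columns
        (n % (num_threads * nr)) != 0 &&       // unevenly divisible over n
        ((m % (num_threads * nr)) == 0 ||      // rows divide evenly
         m / n >= 6))))                        // or rows dominate
    return false;
  if (n / num_threads < 16 * nr && m > n * 32) return false;
  return true;
}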
+
+ Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
+ int num_threads, bool shard_by_col) const {
+ Index gm = 1;
+ Index gm1 = 1;
+ Index nm0 = divup(m, bm);
+ Index nm1 = nm0;
+ for (;;) {
+ // Find the next candidate for the m grain size. It needs to result in a
+ // different number of blocks. E.g. if we have 10 kernels, we want to try
+ // 5 and 10, but not 6, 7, 8 and 9.
+ while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
+ if (gm1 > nm0) break;
+ // Check the candidate.
+ int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
+ shard_by_col);
+ if (res < 0) break;
+ nm1 = divup(nm0, gm1);
+ if (res == 0) continue;
+ // Commit new grain size.
+ gm = gm1;
+ }
+ return gm;
+ }
+
+ Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+ int num_threads, bool shard_by_col) const {
+ Index gn = 1;
+ Index gn1 = 1;
+ Index nn0 = divup(n, bn);
+ Index nn1 = nn0;
+ for (;;) {
+ while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
+ if (gn1 > nn0) break;
+ int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
+ shard_by_col);
+ if (res < 0) break;
+ nn1 = divup(nn0, gn1);
+ if (res == 0) continue;
+ gn = gn1;
+ }
+ return gn;
+ }
+
+ // checkGrain checks whether grain (gm, gn) is suitable and is better than
+ // (oldgm, oldgn).
+ int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+ Index gn, Index oldgm, Index oldgn, int num_threads,
+ bool shard_by_col) const {
+ const TensorOpCost cost =
+ contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
+ double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+ static_cast<double>(bm) * gm * bn * gn, cost);
+ // If the task is too small, then we agree on it regardless of anything
+ // else. Otherwise synchronization overheads will dominate.
+ if (taskSize < 1) return 1;
+ // If it is too large, then we reject it and all larger tasks.
+ if (taskSize > 2) return -1;
+ // Now we are in a presumably good task size range.
+ // The main deciding factor here is parallelism. Consider that we have 12
+ // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
+ // But grains 2 and 4 yield 6 and 3 tasks, which gives us a parallelism of
+ // 0.75 (at most 3/4 of the cores will be busy), while grain size 3 gives
+ // us 4 tasks and a parallelism of 1 (we can load all cores).
+ Index nm0 = divup(m, bm);
+ Index nn0 = divup(n, bn);
+ Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
+ double new_parallelism = static_cast<double>(new_tasks) /
+ (divup<int>(new_tasks, num_threads) * num_threads);
+ Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
+ double old_parallelism = static_cast<double>(old_tasks) /
+ (divup<int>(old_tasks, num_threads) * num_threads);
+ if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
+ return 0;
+ }
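The 12-kernels/4-threads example from the comment above can be reproduced with a short standalone program:

#include <cstdio>
#include <initializer_list>

static long divup(long a, long b) { return (a + b - 1) / b; }

int main() {
  const long threads = 4, kernels = 12;
  // parallelism = tasks / (ceil(tasks / threads) * threads)
  for (long grain : {2L, 3L, 4L}) {
    long tasks = divup(kernels, grain);
    double par = static_cast<double>(tasks) / (divup(tasks, threads) * threads);
    std::printf("grain %ld -> %ld tasks, parallelism %.2f\n",
                grain, tasks, par);
  }
  // Prints 0.75 for grains 2 and 4, and 1.00 for grain 3.
  return 0;
}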
+
+#else // EIGEN_USE_SIMPLE_THREAD_POOL
+
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
if (this->m_j_size == 1) {
@@ -376,7 +995,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
gebp(arg.output.getSubMapper(m_base_start, arg.n),
(*arg.blockAs)[blockAId], arg.blockB,
- actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
+ actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
// Notify that the kernel is done.
const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
@@ -384,6 +1003,47 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
}
}
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
+
+ TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
+ bool shard_by_col, bool prepacked) const {
+ const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
+ PacketType<RhsScalar, Device>::size);
+ const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+ const double kd = static_cast<double>(bk);
+ // Peak VFMA bandwidth is 0.5. However, if we do not have enough data for
+ // vectorization, bandwidth drops. The 4.0 and 2.0 bandwidth values were
+ // determined experimentally.
+ double computeBandwidth = bk == 1 ? 4.0 :
+ (shard_by_col ? bn : bm) < Traits::nr ||
+ (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+ // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on the latest Intel
+ // processors. However, for MULPS/ADDPS we have a dependent sequence of 2
+ // such instructions, so the overall bandwidth is 1.0.
+ if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
+ // Computations.
+ TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
+ // Output stores.
+ cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+ if (prepacked) {
+ // Packing and kernels are executed in different tasks. When we calculate
+ // the task grain size we look only at the kernel cost, assuming that the
+ // kernel is more expensive than packing.
+ return cost;
+ }
+ // Lhs/rhs loads + computations.
+ TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
+ TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
+ // Lhs packing memory cost does not contribute considerably to overall
+ // execution time because lhs is prefetched early and accessed sequentially.
+ if (shard_by_col)
+ lhsCost.dropMemoryCost();
+ else
+ rhsCost.dropMemoryCost();
+ return cost + lhsCost + rhsCost;
+ }
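A standalone sketch of just the bandwidth selection above, with nr/mr and FMA availability passed in as assumed parameters rather than taken from Traits and the build flags:

// Returns the estimated reciprocal multiply-add throughput per element.
double computeBandwidthSketch(long bk, long bm, long bn, bool shard_by_col,
                              long nr, long mr, bool has_fma) {
  double bw = (bk == 1) ? 4.0
            : ((shard_by_col ? bn : bm) < nr ||
               (shard_by_col ? bm : bn) < mr) ? 2.0
                                              : 0.5;  // peak VFMA throughput
  // Without FMA, the dependent MULPS/ADDPS pair halves throughput to 1.0.
  if (!has_fma && bw == 0.5) bw = 1.0;
  return bw;
}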
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index a2f1f71f5..860a6949a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -164,14 +164,14 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
};
template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
impl.evalSubExprsIfNeeded(NULL);
return true;
}
};
template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
return impl.evalSubExprsIfNeeded(data);
}
};
@@ -193,7 +193,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
enum {
IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+ PacketAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
@@ -224,11 +224,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
- const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
- PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
- SrcCoeffRatio, TgtCoeffRatio> converter(m_impl);
- return converter.template packet<LoadMode>(index);
+ const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
+ return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -249,7 +247,31 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
- TensorEvaluator<ArgType, Device> m_impl;
+ template <int LoadMode, bool ActuallyVectorize>
+ struct PacketConv {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ internal::scalar_cast_op<SrcType, TargetType> converter;
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = converter(impl.coeff(index+i));
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ };
+
+ template <int LoadMode>
+ struct PacketConv<LoadMode, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+ const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+ PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
+ SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+ return converter.template packet<LoadMode>(index);
+ }
+ };
+
+ TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 091007ab7..abdf742c6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -254,7 +254,7 @@ struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, t
template<typename Indices, typename InputXprType, typename KernelXprType>
-class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 0f6dcedaa..83c449cf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -10,10 +10,6 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
-//#if !defined(EIGEN_USE_GPU)
-//#define EIGEN_USE_COST_MODEL
-//#endif
-
namespace Eigen {
/** \class TensorEvaluator
@@ -32,45 +28,47 @@ class TensorOpCost {
// model based on minimal reciprocal throughput numbers from Intel or
// Agner Fog's tables would be better than what is there now.
template <typename ArgType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
return internal::functor_traits<
internal::scalar_product_op<ArgType, ArgType> >::Cost;
}
template <typename ArgType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
}
template <typename ArgType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
return internal::functor_traits<
internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
}
template <typename ArgType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
}
template <typename SrcType, typename TargetType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
return internal::functor_traits<
internal::scalar_cast_op<SrcType, TargetType> >::Cost;
}
+ EIGEN_DEVICE_FUNC
TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+ EIGEN_DEVICE_FUNC
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
: bytes_loaded_(bytes_loaded),
bytes_stored_(bytes_stored),
compute_cycles_(compute_cycles) {}
+ EIGEN_DEVICE_FUNC
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
bool vectorized, double packet_size)
: bytes_loaded_(bytes_loaded),
bytes_stored_(bytes_stored),
compute_cycles_(vectorized ? compute_cycles / packet_size
: compute_cycles) {
- using std::isfinite;
- eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded));
- eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored));
- eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles));
+ eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+ eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+ eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
@@ -96,21 +94,21 @@ class TensorOpCost {
}
// TODO(rmlarsen): Define min in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
- const TensorOpCost& rhs) {
- bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
- bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
- compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
- return *this;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
// TODO(rmlarsen): Define max in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
- const TensorOpCost& rhs) {
- bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
- bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
- compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
- return *this;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 1d2d162dc..4f5767bc7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -12,6 +12,8 @@
namespace Eigen {
+static const int kCudaScratchSize = 1024;
+
// This defines an interface that GPUDevice can take to use
// CUDA streams underneath.
class StreamInterface {
@@ -24,6 +26,15 @@ class StreamInterface {
// Allocate memory on the actual device where the computation will run
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
+
+ // Return a scratchpad buffer of kCudaScratchSize (1k) bytes
+ virtual void* scratchpad() const = 0;
+
+ // Return a semaphore. The semaphore is initialized to 0, and each kernel
+ // using it is responsible for resetting it to 0 upon completion, to
+ // maintain the invariant that the semaphore is always equal to 0 at the
+ // start of each kernel.
+ virtual unsigned int* semaphore() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
@@ -31,7 +42,21 @@ static bool m_devicePropInitialized = false;
static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
- if (!m_devicePropInitialized) {
+ // Attempts to ensure proper behavior in the case of multiple threads
+ // calling this function simultaneously. This would be trivial to
+ // implement if we could use std::mutex, but unfortunately std::mutex
+ // doesn't compile with nvcc, so we resort to atomics and thread fences
+ // instead. Note that if the caller uses a compiler that doesn't support
+ // C++11 we can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+ static std::atomic<bool> first(true);
+ if (first.exchange(false)) {
+#else
+ static bool first = true;
+ if (first) {
+ first = false;
+#endif
+ // We're the first thread to reach this point.
int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices);
if (status != cudaSuccess) {
@@ -52,7 +77,19 @@ static void initializeDeviceProp() {
assert(status == cudaSuccess);
}
}
+
+#if __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_release);
+#endif
m_devicePropInitialized = true;
+ } else {
+ // Wait for the other thread to initialize the properties.
+ while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+ std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+ sleep(1);
+ }
}
}
}
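The same initialization pattern, reduced to a generic standalone sketch (the helper and callback names here are hypothetical, not Eigen API; the real code additionally falls back to a plain bool when C++11 atomics are unavailable):

#include <atomic>
#include <thread>

// The first thread to flip `first` runs the setup; the others spin on
// `ready`.
static std::atomic<bool> first(true);
static std::atomic<bool> ready(false);

void initOnce(void (*setup_fn)()) {
  if (first.exchange(false)) {
    setup_fn();                                   // we won the race
    ready.store(true, std::memory_order_release); // publish the result
  } else {
    while (!ready.load(std::memory_order_acquire))
      std::this_thread::yield();                  // wait for the winner
  }
}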
@@ -62,12 +99,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
- CudaStreamDevice() : stream_(&default_stream) {
+ CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
cudaGetDevice(&device_);
initializeDeviceProp();
}
// Use the default stream on the specified device
- CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
+ CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
initializeDeviceProp();
}
// Use the specified stream. Note that it's the
@@ -75,7 +112,7 @@ class CudaStreamDevice : public StreamInterface {
// the specified device. If no device is specified the code
// assumes that the stream is associated to the current gpu device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
- : stream_(stream), device_(device) {
+ : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) {
cudaGetDevice(&device_);
} else {
@@ -89,6 +126,12 @@ class CudaStreamDevice : public StreamInterface {
initializeDeviceProp();
}
+ virtual ~CudaStreamDevice() {
+ if (scratch_) {
+ deallocate(scratch_);
+ }
+ }
+
const cudaStream_t& stream() const { return *stream_; }
const cudaDeviceProp& deviceProperties() const {
return m_deviceProperties[device_];
@@ -112,9 +155,29 @@ class CudaStreamDevice : public StreamInterface {
assert(err == cudaSuccess);
}
+ virtual void* scratchpad() const {
+ if (scratch_ == NULL) {
+ scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
+ }
+ return scratch_;
+ }
+
+ virtual unsigned int* semaphore() const {
+ if (semaphore_ == NULL) {
+ char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+ semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+ cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+ EIGEN_UNUSED_VARIABLE(err)
+ assert(err == cudaSuccess);
+ }
+ return semaphore_;
+ }
+
private:
const cudaStream_t* stream_;
int device_;
+ mutable void* scratch_;
+ mutable unsigned int* semaphore_;
};
struct GpuDevice {
@@ -131,22 +194,20 @@ struct GpuDevice {
return stream_->stream();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return stream_->allocate(num_bytes);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return NULL;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
stream_->deallocate(buffer);
+ }
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
+ EIGEN_STRONG_INLINE void* scratchpad() const {
+ return stream_->scratchpad();
+ }
+
+ EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+ return stream_->semaphore();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
@@ -156,30 +217,22 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
@@ -188,21 +241,21 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+ EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on cuda devices.
return firstLevelCacheSize();
@@ -222,56 +275,26 @@ struct GpuDevice {
#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
return stream_->deviceProperties().multiProcessorCount;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
return stream_->deviceProperties().maxThreadsPerBlock;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
return stream_->deviceProperties().sharedMemPerBlock;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return stream_->deviceProperties().major;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+ EIGEN_STRONG_INLINE int minorDeviceVersion() const {
return stream_->deviceProperties().minor;
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
- return 0;
-#endif
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+ EIGEN_STRONG_INLINE int maxBlocks() const {
return max_blocks_;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index c02891465..069680a11 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -14,7 +14,7 @@ namespace Eigen {
// Use the SimpleThreadPool by default. We'll switch to the new non blocking
// thread pool later.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
// Build a thread pool device on top of an existing pool of threads.
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
- ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+ ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
::memset(buffer, c, n);
}
- EIGEN_STRONG_INLINE size_t numThreads() const {
+ EIGEN_STRONG_INLINE int numThreads() const {
return num_threads_;
}
@@ -151,9 +151,7 @@ struct ThreadPoolDevice {
template <class Function, class... Args>
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
Notification* n = new Notification();
- std::function<void()> func =
- std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
return n;
}
@@ -161,20 +159,118 @@ struct ThreadPoolDevice {
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
Function&& f,
Args&&... args) const {
- std::function<void()> func = std::bind(
- &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(
+ &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
- std::function<void()> func = std::bind(f, args...);
- pool_->Schedule(func);
+ pool_->Schedule(std::bind(f, args...));
+ }
+
+ // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+ // called from one of the threads in pool_. Returns -1 otherwise.
+ EIGEN_STRONG_INLINE int currentThreadId() const {
+ return pool_->CurrentThreadId();
+ }
+
+ // parallelFor executes f with [0, n) arguments in parallel and waits for
+ // completion. F accepts a half-open interval [first, last).
+ // The block size is chosen based on the iteration cost and the resulting
+ // parallel efficiency. If block_align is not nullptr, it is called to
+ // round up the block size.
+ void parallelFor(Index n, const TensorOpCost& cost,
+ std::function<Index(Index)> block_align,
+ std::function<void(Index, Index)> f) const {
+ typedef TensorCostModel<ThreadPoolDevice> CostModel;
+ if (n <= 1 || numThreads() == 1 ||
+ CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+ f(0, n);
+ return;
+ }
+
+ // Calculate the block size based on (1) the iteration cost and (2) the
+ // parallel efficiency. We want blocks to be neither too small (to mitigate
+ // parallelization overheads) nor too large (to mitigate the tail effect
+ // and potential load imbalance), and we also want the number of blocks
+ // to be evenly divisible across threads.
+
+ double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+ Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
+ const Index max_block_size =
+ numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
+ if (block_align) {
+ Index new_block_size = block_align(block_size);
+ eigen_assert(new_block_size >= block_size);
+ block_size = numext::mini(n, new_block_size);
+ }
+ Index block_count = divup(n, block_size);
+ // Calculate parallel efficiency as fraction of total CPU time used for
+ // computations:
+ double max_efficiency =
+ static_cast<double>(block_count) /
+ (divup<int>(block_count, numThreads()) * numThreads());
+ // Now try to increase block size up to max_block_size as long as it
+ // doesn't decrease parallel efficiency.
+ for (Index prev_block_count = block_count; prev_block_count > 1;) {
+ // This is the next block size that divides the range into a smaller
+ // number of blocks than the current block_size.
+ Index coarser_block_size = divup(n, prev_block_count - 1);
+ if (block_align) {
+ Index new_block_size = block_align(coarser_block_size);
+ eigen_assert(new_block_size >= coarser_block_size);
+ coarser_block_size = numext::mini(n, new_block_size);
+ }
+ if (coarser_block_size > max_block_size) {
+ break; // Reached max block size. Stop.
+ }
+ // Recalculate parallel efficiency.
+ const Index coarser_block_count = divup(n, coarser_block_size);
+ eigen_assert(coarser_block_count < prev_block_count);
+ prev_block_count = coarser_block_count;
+ const double coarser_efficiency =
+ static_cast<double>(coarser_block_count) /
+ (divup<int>(coarser_block_count, numThreads()) * numThreads());
+ if (coarser_efficiency + 0.01 >= max_efficiency) {
+ // Taking it.
+ block_size = coarser_block_size;
+ block_count = coarser_block_count;
+ if (max_efficiency < coarser_efficiency) {
+ max_efficiency = coarser_efficiency;
+ }
+ }
+ }
+
+ // Recursively divide the range into halves until we reach block_size.
+ // The division code rounds mid to a multiple of block_size, so we are
+ // guaranteed to get block_count leaves that do the actual computations.
+ Barrier barrier(static_cast<unsigned int>(block_count));
+ std::function<void(Index, Index)> handleRange;
+ handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
+ if (last - first <= block_size) {
+ // Single block or less, execute directly.
+ f(first, last);
+ barrier.Notify();
+ return;
+ }
+ // Split into halves and submit to the pool.
+ Index mid = first + divup((last - first) / 2, block_size) * block_size;
+ pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
+ pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+ };
+ handleRange(0, n);
+ barrier.Wait();
+ }
+
+ // Convenience wrapper for parallelFor that does not align blocks.
+ void parallelFor(Index n, const TensorOpCost& cost,
+ std::function<void(Index, Index)> f) const {
+ parallelFor(n, cost, nullptr, std::move(f));
}
private:
ThreadPoolInterface* pool_;
- size_t num_threads_;
+ int num_threads_;
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
index ca9ac79df..1a30e45fb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
@@ -44,7 +44,7 @@ template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(c
}
-#if defined(EIGEN_HAS_CONSTEXPR)
+#if EIGEN_HAS_CONSTEXPR
template <typename Index, std::size_t Rank>
struct index_known_statically_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index f0b8ac958..b24cdebf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -29,14 +29,6 @@ namespace Eigen {
* \sa Tensor
*/
-// Can't use std::pair on cuda devices
-template <typename Index> struct IndexPair {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { }
- Index first;
- Index second;
-};
-
// Boilerplate code
namespace internal {
@@ -115,7 +107,7 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
// todo: add assertion
@@ -182,7 +174,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
return *this;
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
explicit Sizes(std::initializer_list<std::size_t>) {
// todo: add assertion
@@ -190,13 +182,13 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
#else
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
#endif
@@ -290,31 +282,31 @@ struct DSizes : array<DenseIndex, NumDims> {
(*this)[0] = i0;
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#else
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
eigen_assert(NumDims == 2);
(*this)[0] = i0;
(*this)[1] = i1;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
eigen_assert(NumDims == 3);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
eigen_assert(NumDims == 4);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
(*this)[3] = i3;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
eigen_assert(NumDims == 5);
(*this)[0] = i0;
(*this)[1] = i1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index c556fec0f..a08dfa7c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -56,7 +56,7 @@ struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType>
template<typename XprType>
-class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> >
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
@@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = true,
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index ae4ce3c90..61c111cec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -129,6 +129,10 @@ template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double loadConstant(const double* address) {
return __ldg(address);
}
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+Eigen::half loadConstant(const Eigen::half* address) {
+ return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
+}
#endif
}
@@ -222,7 +226,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
- : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
+ : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
{ }
typedef typename XprType::Index Index;
@@ -239,13 +243,13 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
- return m_functor(index);
+ return m_wrapper(m_functor, index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- return m_functor.template packetOp<Index, PacketReturnType>(index);
+ return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -259,6 +263,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
private:
const NullaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
+ const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
};
@@ -399,6 +404,101 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorEvaluator<RightArgType, Device> m_rightImpl;
};
+// -------------------- CwiseTernaryOp --------------------
+
+template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
+{
+ typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
+ internal::functor_traits<TernaryOp>::PacketAccess,
+ Layout = TensorEvaluator<Arg1Type, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+ : m_functor(op.functor()),
+ m_arg1Impl(op.arg1Expression(), device),
+ m_arg2Impl(op.arg2Expression(), device),
+ m_arg3Impl(op.arg3Expression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg2Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg3Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg2Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg3Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+
+ eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+ return m_arg1Impl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_arg1Impl.evalSubExprsIfNeeded(NULL);
+ m_arg2Impl.evalSubExprsIfNeeded(NULL);
+ m_arg3Impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_arg1Impl.cleanup();
+ m_arg2Impl.cleanup();
+ m_arg3Impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
+ m_arg2Impl.template packet<LoadMode>(index),
+ m_arg3Impl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+ return m_arg1Impl.costPerCoeff(vectorized) +
+ m_arg2Impl.costPerCoeff(vectorized) +
+ m_arg3Impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+ const TernaryOp m_functor;
+ TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+ TensorEvaluator<Arg1Type, Device> m_arg2Impl;
+ TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
// -------------------- SelectOp --------------------
@@ -475,7 +575,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
private:
TensorEvaluator<IfArgType, Device> m_condImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 5c3d4d630..0cac7b179 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,13 +59,14 @@ class TensorExecutor<Expression, DefaultDevice, true>
{
const Index size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
- // Manually unroll this loop since compilers don't do it.
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
+ // unroll the loop at the expense of inlining.
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
- evaluator.evalPacket(i);
- evaluator.evalPacket(i+PacketSize);
- evaluator.evalPacket(i+2*PacketSize);
- evaluator.evalPacket(i+3*PacketSize);
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
}
const Index VectorizedSize = (size / PacketSize) * PacketSize;
for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
@@ -92,24 +93,30 @@ struct EvalRange {
evaluator.evalScalar(i);
}
}
+
+ static Index alignBlockSize(Index size) {
+ return size;
+ }
};
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
+ static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
Index i = first;
- const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
Index last_chunk_offset = last - 4 * PacketSize;
- // Manually unroll this loop since compilers don't do it.
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
+ // unroll the loop at the expense of inlining.
for (; i <= last_chunk_offset; i += 4*PacketSize) {
- evaluator.evalPacket(i);
- evaluator.evalPacket(i+PacketSize);
- evaluator.evalPacket(i+2*PacketSize);
- evaluator.evalPacket(i+3*PacketSize);
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
}
last_chunk_offset = last - PacketSize;
for (; i <= last_chunk_offset; i += PacketSize) {
@@ -120,6 +127,15 @@ struct EvalRange<Evaluator, Index, true> {
evaluator.evalScalar(i);
}
}
+
+ static Index alignBlockSize(Index size) {
+ // Align block size to packet size and account for unrolling in run above.
+ if (size >= 16 * PacketSize) {
+ return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
+ }
+ // For smaller sizes, aligning to 4 * PacketSize would increase the block
+ // size by more than 25%, so only round up to a multiple of PacketSize.
+ return (size + PacketSize - 1) & ~(PacketSize - 1);
+ }
};
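The round-up idiom (size + A - 1) & ~(A - 1) used in alignBlockSize assumes the alignment A is a power of two, which holds here because packet sizes are powers of two. A worked example with PacketSize = 8 (assumed): size = 1000 >= 16 * 8, so the block is rounded up to a multiple of 32, and (1000 + 31) & ~31 = 1024. For size = 20 the 4 * PacketSize path would return 32, inflating the block by 60%, which is why the fallback rounds to a multiple of PacketSize instead: (20 + 7) & ~7 = 24.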
template <typename Expression, bool Vectorizable>
@@ -133,18 +149,23 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
- const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
const Index size = array_prod(evaluator.dimensions());
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+ device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+ EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+ [&evaluator](Index first, Index last) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+ });
+#else
size_t num_threads = device.numThreads();
-#ifdef EIGEN_USE_COST_MODEL
if (num_threads > 1) {
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
-#endif
if (num_threads == 1) {
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
} else {
+ const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
@@ -161,11 +182,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
}
barrier.Wait();
}
+#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
}
evaluator.cleanup();
}
};
-#endif
+#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
@@ -212,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> {
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel(Evaluator memcopied_eval, Index size) {
+EigenMetaKernel(Evaluator eval, Index size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
- // Cuda memcopies the kernel arguments. That's fine for POD, but for more
- // complex types such as evaluators we should really conform to the C++
- // standard and call a proper copy constructor.
- Evaluator eval(memcopied_eval);
-
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}
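The kernel above uses the standard CUDA grid-stride pattern: each thread starts at its global index and advances by the total number of launched threads, so any grid size covers any problem size. A minimal standalone sketch of that pattern (hypothetical kernel, not part of this patch):

    template <typename Index>
    __global__ void grid_stride_fill(float* out, Index size, float value) {
      const Index first = blockIdx.x * blockDim.x + threadIdx.x;
      const Index step = blockDim.x * gridDim.x;
      // Thread t writes indices t, t + step, t + 2*step, ... while in range.
      for (Index i = first; i < size; i += step) {
        out[i] = value;
      }
    }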
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 8491c4ca2..5f2e329f2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -219,6 +219,86 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsX
namespace internal {
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
+{
+ // Type promotion to handle the case where the types of the args are different.
+ typedef typename result_of<
+ TernaryOp(typename Arg1XprType::Scalar,
+ typename Arg2XprType::Scalar,
+ typename Arg3XprType::Scalar)>::type Scalar;
+ typedef traits<Arg1XprType> XprTraits;
+ typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+ typedef typename traits<Arg1XprType>::Index Index;
+ typedef typename Arg1XprType::Nested Arg1Nested;
+ typedef typename Arg2XprType::Nested Arg2Nested;
+ typedef typename Arg3XprType::Nested Arg3Nested;
+ typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+ typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+ typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+ typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+ typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef Scalar CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+ : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const TernaryOp& functor() const { return m_functor; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+ arg1Expression() const { return m_arg1_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+ arg2Expression() const { return m_arg2_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+ arg3Expression() const { return m_arg3_xpr; }
+
+ protected:
+ typename Arg1XprType::Nested m_arg1_xpr;
+ typename Arg2XprType::Nested m_arg2_xpr;
+ typename Arg3XprType::Nested m_arg3_xpr;
+ const TernaryOp m_functor;
+};
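Any functor exposing a three-argument scalar operator() (and a matching packetOp when packet access is claimed in its functor_traits) can be plugged into TensorCwiseTernaryOp. A hypothetical clamp functor as an illustration (scalar_clamp_op is not an Eigen name):

    template <typename T>
    struct scalar_clamp_op {
      EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
      T operator()(const T& x, const T& lo, const T& hi) const {
        return x < lo ? lo : (x > hi ? hi : x);
      }
      template <typename Packet>
      EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
      Packet packetOp(const Packet& x, const Packet& lo, const Packet& hi) const {
        return internal::pmin(internal::pmax(x, lo), hi);
      }
    };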
+
+
+namespace internal {
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
: traits<ThenXprType>
@@ -252,7 +332,7 @@ struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename e
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index ece2ed91b..08eb5595a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -329,7 +329,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
- a[i] = data[i] * std::conj(pos_j_base_powered[i]);
+ a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
}
else {
a[i] = data[i] * pos_j_base_powered[i];
@@ -344,7 +344,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
b[i] = pos_j_base_powered[i];
}
else {
- b[i] = std::conj(pos_j_base_powered[i]);
+ b[i] = numext::conj(pos_j_base_powered[i]);
}
}
for (Index i = n; i < m - n; ++i) {
@@ -355,7 +355,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
b[i] = pos_j_base_powered[m-i];
}
else {
- b[i] = std::conj(pos_j_base_powered[m-i]);
+ b[i] = numext::conj(pos_j_base_powered[m-i]);
}
}
@@ -379,7 +379,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
- data[i] = a[i] * std::conj(pos_j_base_powered[i]);
+ data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
}
else {
data[i] = a[i] * pos_j_base_powered[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index b27ee0084..fcee5f60d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -65,7 +65,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
{
@@ -97,7 +97,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
{
@@ -128,7 +128,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
return m_storage.data()[0];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
{
@@ -213,7 +213,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
return coeff(index);
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
{
@@ -309,7 +309,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
}
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+#if EIGEN_HAS_RVALUE_REFERENCES
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
: m_storage(other.m_storage)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 7ec757519..c23ecdbc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -55,7 +55,7 @@ struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<X
template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> >
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
- const Index numValues = m_impl.dimensions().TotalSize();
+ const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.
if (NumTraits<CoeffReturnType>::RequireInitialization) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index a8bd8b888..490ddd8bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -16,11 +16,12 @@ template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType
template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
template<typename PlainObjectType> class TensorRef;
-template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
+template<typename Derived, int AccessLevel> class TensorBase;
template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
template<typename XprType> class TensorIndexTupleOp;
@@ -42,9 +43,11 @@ template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
template<typename Shuffle, typename XprType> class TensorShufflingOp;
template<typename Strides, typename XprType> class TensorStridingOp;
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp;
template<typename Strides, typename XprType> class TensorInflationOp;
template<typename Generator, typename XprType> class TensorGeneratorOp;
template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
+template<typename Op, typename XprType> class TensorScanOp;
template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 33cd00391..7164e8d60 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,7 @@ struct scalar_mod_op {
};
template <typename Scalar>
struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
/** \internal
@@ -38,7 +38,7 @@ struct scalar_mod2_op {
};
template <typename Scalar>
struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
template <typename Scalar>
struct scalar_fmod_op {
@@ -69,7 +69,7 @@ struct scalar_sigmoid_op {
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(const Packet& x) const {
- const Packet one = pset1<Packet>(1);
+ const Packet one = pset1<Packet>(T(1));
return pdiv(one, padd(one, pexp(pnegate(x))));
}
};
@@ -84,14 +84,23 @@ struct functor_traits<scalar_sigmoid_op<T> > {
};
+template<typename Reducer, typename Device>
+struct reducer_traits {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
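Callers are meant to query these traits rather than the reducer's own static members, since the answer can differ per device (e.g. for half floats on GPU). An illustrative query, not taken from the patch:

    typedef SumReducer<float> Reducer;
    // Vectorize only when the reducer supports packets on this device, and
    // feed the per-coefficient cost into the threading cost model.
    const bool vectorize = reducer_traits<Reducer, DefaultDevice>::PacketAccess;
    const int reduce_cost = reducer_traits<Reducer, DefaultDevice>::Cost;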
+
// Standard reduction functors
template <typename T> struct SumReducer
{
- static const bool PacketAccess = true;
+ static const bool PacketAccess = packet_traits<T>::HasAdd;
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
- (*accum) += t;
+ internal::scalar_sum_op<T> sum_op;
+ *accum = sum_op(*accum, t);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
@@ -119,16 +128,26 @@ template <typename T> struct SumReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd
+ };
+};
+
+
template <typename T> struct MeanReducer
{
- static const bool PacketAccess = !NumTraits<T>::IsInteger;
+ static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
static const bool IsStateful = true;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
MeanReducer() : scalarCount_(0), packetCount_(0) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
- (*accum) += t;
+ internal::scalar_sum_op<T> sum_op;
+ *accum = sum_op(*accum, t);
scalarCount_++;
}
template <typename Packet>
@@ -162,9 +181,44 @@ template <typename T> struct MeanReducer
DenseIndex packetCount_;
};
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd
+ };
+};
+
+
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::lowest();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return -Eigen::NumTraits<T>::infinity();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::highest();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::infinity();
+ }
+};
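The four specializations select the identity element for the reduction: the smallest representable value for a max, the largest for a min, with +/-infinity replacing lowest()/highest() for non-integer types. Concrete instantiations, shown only for illustration:

    MinMaxBottomValue<int,   true,  true >::bottom_value(); // lowest int:  identity of max
    MinMaxBottomValue<float, true,  false>::bottom_value(); // -infinity:   identity of max
    MinMaxBottomValue<int,   false, true >::bottom_value(); // highest int: identity of min
    MinMaxBottomValue<float, false, false>::bottom_value(); // +infinity:   identity of min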
+
+
template <typename T> struct MaxReducer
{
- static const bool PacketAccess = true;
+ static const bool PacketAccess = packet_traits<T>::HasMax;
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -174,9 +228,8 @@ template <typename T> struct MaxReducer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmax<Packet>(*accum, p);
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return Eigen::NumTraits<T>::lowest();
+ return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -195,9 +248,18 @@ template <typename T> struct MaxReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMax
+ };
+};
+
+
template <typename T> struct MinReducer
{
- static const bool PacketAccess = true;
+ static const bool PacketAccess = packet_traits<T>::HasMin;
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -207,9 +269,8 @@ template <typename T> struct MinReducer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmin<Packet>(*accum, p);
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return Eigen::NumTraits<T>::highest();
+ return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -228,10 +289,18 @@ template <typename T> struct MinReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMin
+ };
+};
+
template <typename T> struct ProdReducer
{
- static const bool PacketAccess = true;
+ static const bool PacketAccess = packet_traits<T>::HasMul;
static const bool IsStateful = false;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
@@ -263,6 +332,14 @@ template <typename T> struct ProdReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::MulCost,
+ PacketAccess = PacketType<T, Device>::HasMul
+ };
+};
+
struct AndReducer
{
@@ -280,6 +357,15 @@ struct AndReducer
}
};
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
struct OrReducer {
static const bool PacketAccess = false;
static const bool IsStateful = false;
@@ -295,6 +381,15 @@ struct OrReducer {
}
};
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false
+ };
+};
+
+
// Argmin/Argmax reducers
template <typename T> struct ArgMaxTupleReducer
{
@@ -312,6 +407,15 @@ template <typename T> struct ArgMaxTupleReducer
}
};
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
+ };
+};
+
+
template <typename T> struct ArgMinTupleReducer
{
static const bool PacketAccess = false;
@@ -328,457 +432,11 @@ template <typename T> struct ArgMinTupleReducer
}
};
-
-// Random number generation
-namespace {
-#ifdef __CUDA_ARCH__
-__device__ int get_random_seed() {
- return clock();
-}
-#else
-int get_random_seed() {
-#ifdef _WIN32
- SYSTEMTIME st;
- GetSystemTime(&st);
- return st.wSecond + 1000 * st.wMilliseconds;
-#elif defined __APPLE__
- return static_cast<int>(mach_absolute_time());
-#else
- timespec ts;
- clock_gettime(CLOCK_REALTIME, &ts);
- return static_cast<int>(ts.tv_nsec);
-#endif
-}
-#endif
-}
-
-#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
-// We're not compiling a cuda kernel
-template <typename T> class UniformRandomGenerator {
-
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- if (!deterministic) {
- srand(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- }
-
- template<typename Index>
- T operator()(Index) const {
- return random<T>();
- }
- template<typename Index, typename PacketType>
- PacketType packetOp(Index) const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX T values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
- values[i] = random<T>();
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- bool m_deterministic;
-};
-
-#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
-template <> class UniformRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
- m_generator = new std::mt19937();
- m_generator->seed(other(0) * UINT_MAX);
- m_deterministic = other.m_deterministic;
- }
- ~UniformRandomGenerator() {
- delete m_generator;
- }
-
- template<typename Index>
- float operator()(Index) const {
- return m_distribution(*m_generator);
- }
- template<typename Index, typename PacketType>
- PacketType packetOp(Index i) const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX float values[packetSize];
- for (int k = 0; k < packetSize; ++k) {
- values[k] = this->operator()(i);
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- UniformRandomGenerator& operator = (const UniformRandomGenerator&);
- // Make sure m_deterministic comes first to match the layout of the cpu
- // version of the code.
- bool m_deterministic;
- std::mt19937* m_generator;
- mutable std::uniform_real_distribution<float> m_distribution;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
- m_generator = new std::mt19937();
- m_generator->seed(other(0) * UINT_MAX);
- m_deterministic = other.m_deterministic;
- }
- ~UniformRandomGenerator() {
- delete m_generator;
- }
-
- template<typename Index>
- double operator()(Index) const {
- return m_distribution(*m_generator);
- }
- template<typename Index, typename PacketType>
- PacketType packetOp(Index i) const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX double values[packetSize];
- for (int k = 0; k < packetSize; ++k) {
- values[k] = this->operator()(i);
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- UniformRandomGenerator& operator = (const UniformRandomGenerator&);
- // Make sure m_deterministic comes first to match the layout of the cpu
- // version of the code.
- bool m_deterministic;
- std::mt19937* m_generator;
- mutable std::uniform_real_distribution<double> m_distribution;
-};
-#endif
-
-#else
-
-// We're compiling a cuda kernel
-template <typename T> class UniformRandomGenerator;
-
-template <> class UniformRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
-
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
-
- template<typename Index>
- __device__ float operator()(Index) const {
- return curand_uniform(&m_state);
- }
- template<typename Index, typename PacketType>
- __device__ float4 packetOp(Index) const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_uniform4(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ double operator()(Index) const {
- return curand_uniform_double(&m_state);
- }
- template<typename Index, typename PacketType>
- __device__ double2 packetOp(Index) const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_uniform2_double(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<float> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ std::complex<float> operator()(Index) const {
- float4 vals = curand_uniform4(&m_state);
- return std::complex<float>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class UniformRandomGenerator<std::complex<double> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ std::complex<double> operator()(Index) const {
- double2 vals = curand_uniform2_double(&m_state);
- return std::complex<double>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<UniformRandomGenerator<Scalar> > {
- enum {
- // Rough estimate.
- Cost = 100 * NumTraits<Scalar>::MulCost,
- PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
- };
-};
-
-
-
-#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900)
-// We're not compiling a cuda kernel
-template <typename T> class NormalRandomGenerator {
- public:
- static const bool PacketAccess = true;
-
- NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) {
- if (!deterministic) {
- m_generator->seed(get_random_seed());
- }
- }
- NormalRandomGenerator(const NormalRandomGenerator& other)
- : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
- m_generator->seed(other(0) * UINT_MAX);
- }
- ~NormalRandomGenerator() {
- delete m_generator;
- }
- template<typename Index>
- T operator()(Index) const {
- return m_distribution(*m_generator);
- }
- template<typename Index, typename PacketType>
- PacketType packetOp(Index) const {
- const int packetSize = internal::unpacket_traits<PacketType>::size;
- EIGEN_ALIGN_MAX T values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
- values[i] = m_distribution(*m_generator);
- }
- return internal::pload<PacketType>(values);
- }
-
- private:
- // No assignment
- NormalRandomGenerator& operator = (const NormalRandomGenerator&);
-
- bool m_deterministic;
- mutable std::normal_distribution<T> m_distribution;
- std::mt19937* m_generator;
-};
-
-#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
-
-// We're compiling a cuda kernel
-template <typename T> class NormalRandomGenerator;
-
-template <> class NormalRandomGenerator<float> {
- public:
- static const bool PacketAccess = true;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ float operator()(Index) const {
- return curand_normal(&m_state);
- }
- template<typename Index, typename PacketType>
- __device__ float4 packetOp(Index) const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_normal4(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<double> {
- public:
- static const bool PacketAccess = true;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ double operator()(Index) const {
- return curand_normal_double(&m_state);
- }
- template<typename Index, typename PacketType>
- __device__ double2 packetOp(Index) const {
- EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return curand_normal2_double(&m_state);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<float> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ std::complex<float> operator()(Index) const {
- float4 vals = curand_normal4(&m_state);
- return std::complex<float>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-template <> class NormalRandomGenerator<std::complex<double> > {
- public:
- static const bool PacketAccess = false;
-
- __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- __device__ NormalRandomGenerator(const NormalRandomGenerator& other) {
- m_deterministic = other.m_deterministic;
- const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- const int seed = m_deterministic ? 0 : get_random_seed();
- curand_init(seed, tid, 0, &m_state);
- }
- template<typename Index>
- __device__ std::complex<double> operator()(Index) const {
- double2 vals = curand_normal2_double(&m_state);
- return std::complex<double>(vals.x, vals.y);
- }
-
- private:
- bool m_deterministic;
- mutable curandStatePhilox4_32_10_t m_state;
-};
-
-#else
-
-template <typename T> class NormalRandomGenerator {
- public:
- static const bool PacketAccess = false;
- NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}
-
- private:
- bool m_deterministic;
-};
-
-#endif
-
-template <typename Scalar>
-struct functor_traits<NormalRandomGenerator<Scalar> > {
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
enum {
- // Rough estimate.
- Cost = 100 * NumTraits<Scalar>::MulCost,
- PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false
};
};
@@ -797,7 +455,7 @@ class GaussianGenerator {
}
}
- T operator()(const array<Index, NumDims>& coordinates) const {
+ EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
T tmp = T(0);
for (size_t i = 0; i < NumDims; ++i) {
T offset = coordinates[i] - m_means[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 8ff7d5815..eb1d4934e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -134,7 +134,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 000000000..665b861cf
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) of the given tensors.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+ TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+ const ADerived, const BDerived, const XDerived>
+ betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+ return TensorCwiseTernaryOp<
+ internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+ const BDerived, const XDerived>(
+ a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
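A minimal usage sketch of the new entry point, assuming three rank-1 float tensors of equal size:

    Eigen::Tensor<float, 1> a(3), b(3), x(3);
    a.setValues({0.5f, 2.0f, 8.0f});
    b.setValues({0.5f, 2.0f, 2.0f});
    x.setValues({0.25f, 0.5f, 0.75f});
    // Coefficient i holds the regularized incomplete beta function
    // evaluated at (a(i), b(i), x(i)).
    Eigen::Tensor<float, 1> result = Eigen::betainc(a, b, x);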
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index 38a833f82..a901c5dd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -13,38 +13,61 @@
namespace Eigen {
namespace internal {
-template<>
-struct significant_decimals_impl<std::string>
- : significant_decimals_default_impl<std::string, true>
-{};
-}
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+ typedef typename Tensor::Index Index;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+ if (total_size > 0) {
+ const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+ static const int layout = Tensor::Layout;
+ Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+ os << matrix;
+ }
+ }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+ typedef typename Tensor::Index Index;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+ if (total_size > 0) {
+ Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+ os << array;
+ }
+ }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ os << tensor.coeff(0);
+ }
+};
+} // end namespace internal
template <typename T>
std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+ typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+ typedef typename Evaluator::Dimensions Dimensions;
+
// Evaluate the expression if needed
TensorForcedEvalOp<const T> eval = expr.eval();
- TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+ Evaluator tensor(eval, DefaultDevice());
tensor.evalSubExprsIfNeeded(NULL);
- typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
- typedef typename T::Index Index;
- typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
- const Index total_size = internal::array_prod(tensor.dimensions());
-
- // Print the tensor as a 1d vector or a 2d matrix.
+ // Print the result
static const int rank = internal::array_size<Dimensions>::value;
- if (rank == 0) {
- os << tensor.coeff(0);
- } else if (rank == 1) {
- Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
- os << array;
- } else {
- const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
- static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
- Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
- os << matrix;
- }
+ internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
// Cleanup.
tensor.cleanup();
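The rank dispatch above is what makes streaming work uniformly across tensor expressions; a short usage sketch:

    Eigen::Tensor<int, 2> t(2, 3);
    t.setValues({{1, 2, 3}, {4, 5, 6}});
    std::cout << t << "\n";       // rank >= 2: printed as a 2 x 3 matrix
    std::cout << t.sum() << "\n"; // rank-0 result: printed as a single scalar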
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index bafcc67bd..566856ed2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -174,7 +174,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
- EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
m_paddingValue = op.padding_value();
@@ -362,7 +362,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 985594bc8..3209fecd3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -10,7 +10,8 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
-#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
+
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
#define EIGEN_HAS_INDEX_LIST
@@ -45,6 +46,24 @@ struct type2index {
}
};
+// This can be used with IndexPairList to get compile-time constant pairs,
+// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
+template <DenseIndex f, DenseIndex s>
+struct type2indexpair {
+ static const DenseIndex first = f;
+ static const DenseIndex second = s;
+
+ constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {
+ return IndexPair<DenseIndex>(f, s);
+ }
+
+ EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
+ eigen_assert(val.first == f);
+ eigen_assert(val.second == s);
+ }
+};
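A sketch of the intended use, encoding contraction dimensions as compile-time constants (shapes assumed for illustration):

    // Contract dimension 1 of lhs against dimension 0 of rhs. Because the
    // pair is known statically, evaluators can specialize on it.
    Eigen::IndexPairList<Eigen::type2indexpair<1, 0> > dims;
    Eigen::Tensor<float, 2> lhs(3, 4), rhs(4, 5);
    Eigen::Tensor<float, 2> result = lhs.contract(rhs, dims);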
+
+
template<DenseIndex n> struct NumTraits<type2index<n> >
{
typedef DenseIndex Real;
@@ -73,6 +92,16 @@ EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
}
template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) {
+ val = new_val;
+}
+template <DenseIndex f, DenseIndex s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {
+ val.set(new_val);
+}
+
+
+template <typename T>
struct is_compile_time_constant {
static constexpr bool value = false;
};
@@ -94,7 +123,22 @@ struct is_compile_time_constant<const type2index<idx>& > {
static constexpr bool value = true;
};
-
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<type2indexpair<f, s> > {
+ static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<const type2indexpair<f, s> > {
+ static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<type2indexpair<f, s>& > {
+ static constexpr bool value = true;
+};
+template <DenseIndex f, DenseIndex s>
+struct is_compile_time_constant<const type2indexpair<f, s>& > {
+ static constexpr bool value = true;
+};
template<typename... T>
@@ -184,31 +228,32 @@ template <typename T, typename... O>
-template <DenseIndex Idx>
+template <DenseIndex Idx, typename ValueT>
struct tuple_coeff {
template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) {
- return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+ EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) {
+ return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
}
template <typename... T>
- EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) {
+ EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) {
if (i == Idx) {
update_value(array_get<Idx>(t), value);
} else {
- tuple_coeff<Idx-1>::set(i, t, value);
+ tuple_coeff<Idx-1, ValueT>::set(i, t, value);
}
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) {
return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
- tuple_coeff<Idx-1>::value_known_statically(i, t);
+ tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
- tuple_coeff<Idx-1>::values_up_to_known_statically(t);
+ tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
}
template <typename... T>
@@ -216,19 +261,19 @@ struct tuple_coeff {
return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
array_get<Idx>(t) > array_get<Idx-1>(t) &&
- tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t);
+ tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
}
};
-template <>
-struct tuple_coeff<0> {
+template <typename ValueT>
+struct tuple_coeff<0, ValueT> {
template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) {
+ EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) {
// eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
- return array_get<0>(t) * (i == 0);
+ return array_get<0>(t);
}
template <typename... T>
- EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) {
+ EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
eigen_assert (i == 0);
update_value(array_get<0>(t), value);
}
@@ -254,13 +299,13 @@ struct tuple_coeff<0> {
template<typename FirstType, typename... OtherTypes>
struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
}
EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
@@ -268,14 +313,14 @@ struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
}
EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
}
EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this);
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
}
};
@@ -286,6 +331,23 @@ constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, Ot
}
+template<typename FirstType, typename... OtherTypes>
+struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
+ }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
+ }
+
+ EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+ EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
+
+ EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+ }
+};
+
namespace internal {
template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
@@ -303,6 +365,13 @@ template<typename FirstType, typename... OtherTypes> struct array_size<const Ind
static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
};
+template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
+ static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > {
+ static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+
template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
}
@@ -472,6 +541,57 @@ struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
}
};
+
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+ }
+};
+
+
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+ }
+};
+
+
} // end namespace internal
} // end namespace Eigen
@@ -482,53 +602,69 @@ namespace internal {
template <typename T>
struct index_known_statically_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
return false;
}
};
template <typename T>
struct all_indices_known_statically_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
return false;
}
};
template <typename T>
struct indices_statically_known_to_increase_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
return false;
}
};
template <typename T>
struct index_statically_eq_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
return false;
}
};
template <typename T>
struct index_statically_ne_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
return false;
}
};
template <typename T>
struct index_statically_gt_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
return false;
}
};
template <typename T>
struct index_statically_lt_impl {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+ return false;
+ }
+};
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+ return false;
+ }
+};
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
return false;
}
};
+
+
} // end namespace internal
} // end namespace Eigen
@@ -572,6 +708,16 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i,
return index_statically_lt_impl<T>::run(i, value);
}
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) {
+ return index_pair_first_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) {
+ return index_pair_second_statically_eq_impl<T>::run(i, value);
+}
+
} // end namespace internal
} // end namespace Eigen
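For reference, these wrappers mirror the existing index_statically_* helpers: given an IndexPairList whose pairs are encoded in the type, they report at compile time whether the first or second member of pair i is known to equal a value, and conservatively return false otherwise (including on the fallback path above). A minimal host-side sketch, assuming the type2indexpair helper from TensorIndexList and a compiler new enough for the static path:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Pair 0 is (0,2) and pair 1 is (1,3), both encoded entirely in the type.
      typedef Eigen::IndexPairList<Eigen::type2indexpair<0, 2>,
                                   Eigen::type2indexpair<1, 3> > Pairs;
      bool ok = Eigen::internal::index_pair_first_statically_eq<Pairs>(0, 0) &&
                Eigen::internal::index_pair_second_statically_eq<Pairs>(1, 3) &&
                !Eigen::internal::index_pair_first_statically_eq<Pairs>(1, 0);
      return ok ? 0 : 1;
    }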
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index de2f67d74..f391fb9ee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -189,7 +189,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
index 2d223140e..33edc49e3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
#include <initializer_list>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 33c6c1b0f..ede3939c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -29,25 +29,47 @@ namespace Eigen {
namespace internal {
namespace {
+
// Note: result is undefined if val == 0
template <typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val)
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{
#ifdef __CUDA_ARCH__
- return (sizeof(T) == 8) ? __clzll(val) : __clz(val);
+ return __clz(val);
#elif EIGEN_COMP_MSVC
- unsigned long index;
- if (sizeof(T) == 8) {
- _BitScanReverse64(&index, val);
- } else {
- _BitScanReverse(&index, val);
- }
- return (sizeof(T) == 8) ? 63 - index : 31 - index;
+ unsigned long index;
+ _BitScanReverse(&index, val);
+ return 31 - index;
+#else
+ EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return __builtin_clz(static_cast<uint32_t>(val));
+#endif
+ }
+
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
+ {
+#ifdef __CUDA_ARCH__
+ return __clzll(val);
+#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
+ unsigned long index;
+ _BitScanReverse64(&index, val);
+ return 63 - index;
+#elif EIGEN_COMP_MSVC
+ // MSVC's _BitScanReverse64 is not available for 32-bit builds.
+ unsigned int lo = (unsigned int)(val&0xffffffff);
+ unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
+ int n;
+ if(hi==0)
+ n = 32 + count_leading_zeros<unsigned int>(lo);
+ else
+ n = count_leading_zeros<unsigned int>(hi);
+ return n;
#else
EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return (sizeof(T) == 8) ?
- __builtin_clzll(static_cast<uint64_t>(val)) :
- __builtin_clz(static_cast<uint32_t>(val));
+ return __builtin_clzll(static_cast<uint64_t>(val));
#endif
}
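The 32-bit MSVC branch above has to synthesize a 64-bit clz from the 32-bit one by splitting the value into halves. The same technique as a standalone sketch, with a naive loop standing in for the intrinsic (illustration only; like the original, the result for val == 0 is not meaningful):

    #include <cassert>
    #include <cstdint>

    // Count leading zeros of a 32-bit value the slow, portable way.
    static int clz32(uint32_t x) {
      int n = 0;
      for (uint32_t mask = 0x80000000u; mask != 0 && !(x & mask); mask >>= 1) ++n;
      return n;
    }

    // 64-bit clz from two 32-bit halves: if the high half is zero, the answer
    // is 32 plus the count of the low half; otherwise it is the count of the
    // high half alone.
    static int clz64(uint64_t val) {
      const uint32_t hi = static_cast<uint32_t>(val >> 32);
      const uint32_t lo = static_cast<uint32_t>(val & 0xffffffffu);
      return hi == 0 ? 32 + clz32(lo) : clz32(hi);
    }

    int main() {
      assert(clz64(1ULL) == 63);
      assert(clz64(0x100000000ULL) == 31);
      assert(clz64(0x8000000000000000ULL) == 0);
      return 0;
    }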
@@ -98,7 +120,9 @@ namespace {
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
- TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+ TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
+ - TensorUInt128<static_val<1>, static_val<0> >(1, 0)
+ + TensorUInt128<static_val<0>, static_val<1> >(1);
return static_cast<uint64_t>(result);
#endif
}
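Both branches above compute the round-up "magic multiplier" M = 2^(64+log_div)/d - 2^64 + 1 for division by an invariant integer; TensorUInt128 exists only to carry the 128-bit intermediate when __uint128_t is unavailable. A scaled-down sketch where ordinary 64-bit arithmetic plays that role (note TensorIntDivisor itself recombines with an overflow-safe two-shift form; the simple add-then-shift below is the textbook variant and relies on the wider intermediate):

    #include <cassert>
    #include <cstdint>

    static int ceil_log2(uint32_t d) {
      int l = 0;
      while ((uint32_t(1) << l) < d) ++l;
      return l;
    }

    // n / d via multiplication: M = floor(2^(32+l) / d) - 2^32 + 1, then
    // q = ((M * n >> 32) + n) >> l, all intermediates held in 64 bits.
    static uint32_t divide(uint32_t n, uint32_t d) {
      const int l = ceil_log2(d);
      const uint64_t M = ((uint64_t(1) << 32) << l) / d - (uint64_t(1) << 32) + 1;
      const uint64_t t = (M * n) >> 32;
      return static_cast<uint32_t>((t + n) >> l);
    }

    int main() {
      for (uint32_t d = 2; d < 1000; ++d)
        for (uint32_t n = 0; n < 100000; n += 13)
          assert(divide(n, d) == n / d);
      return 0;
    }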
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index 8ed71f838..ee0078bbc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -28,7 +28,7 @@
// SFINAE requires variadic templates
#ifndef __CUDACC__
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC
#if EIGEN_GNUC_AT_LEAST(4,8)
@@ -44,7 +44,7 @@
typename internal::enable_if< ( __condition__ ) , int >::type = 0
-#if defined(EIGEN_HAS_CONSTEXPR)
+#if EIGEN_HAS_CONSTEXPR
#define EIGEN_CONSTEXPR constexpr
#else
#define EIGEN_CONSTEXPR
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 9ebd9172b..6fb4f4a31 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -57,7 +57,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
@@ -140,7 +140,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
return m_data[index];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
@@ -227,7 +227,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
return m_data[index];
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index cd04716bd..fdb5ee6b8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -47,22 +47,39 @@ template <> struct max_n_1<0> {
// Default packet types
template <typename Scalar, typename Device>
-struct PacketType {
+struct PacketType : internal::packet_traits<Scalar> {
typedef typename internal::packet_traits<Scalar>::type type;
- enum { size = internal::unpacket_traits<type>::size };
};
// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
template <>
-struct PacketType<float, GpuDevice> {
- typedef float4 type;
- static const int size = 4;
-};
-template <>
-struct PacketType<double, GpuDevice> {
- typedef double2 type;
+struct PacketType<half, GpuDevice> {
+ typedef half2 type;
static const int size = 2;
+ enum {
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasBlend = 0,
+
+ HasDiv = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasExp = 1,
+ HasLog = 1,
+ HasLog1p = 0,
+ HasLog10 = 0,
+ HasPow = 1,
+ };
};
#endif
@@ -112,6 +129,20 @@ bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) {
}
+// Can't use std::pair on CUDA devices
+template <typename Idx> struct IndexPair {
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+
+ EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
+ first = val.first;
+ second = val.second;
+ }
+
+ Idx first;
+ Idx second;
+};
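IndexPair is the device-safe stand-in for std::pair used throughout the tensor module; the most visible place user code supplies one is the contraction dimension list. A minimal sketch:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
      a.setConstant(1.0f);
      b.setConstant(2.0f);
      // Contract dimension 1 of a against dimension 0 of b: a matrix product.
      Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
      Eigen::Tensor<float, 2> c = a.contract(b, dims);
      // Every entry sums three 1*2 products.
      return c(0, 0) == 6.0f ? 0 : 1;
    }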
+
#ifdef EIGEN_HAS_SFINAE
namespace internal {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index bfa65a607..d34f1e328 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
protected:
TensorEvaluator<ArgType, Device> m_impl;
@@ -409,7 +409,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
Index inputIndices[] = {0, 0};
@@ -603,6 +603,286 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
};
+
+namespace internal {
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = array_size<StartIndices>::value;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
+{
+ typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type>
+{
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
+{
+ public:
+ typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
+ typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(
+ const XprType& expr, const StartIndices& startIndices,
+ const StopIndices& stopIndices, const Strides& strides)
+ : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices),
+ m_strides(strides) {}
+
+ EIGEN_DEVICE_FUNC
+ const StartIndices& startIndices() const { return m_startIndices; }
+ EIGEN_DEVICE_FUNC
+ const StopIndices& stopIndices() const { return m_stopIndices; }
+ EIGEN_DEVICE_FUNC
+ const Strides& strides() const { return m_strides; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
+ {
+ typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(
+ assign, DefaultDevice());
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
+ {
+ typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(
+ assign, DefaultDevice());
+ return *this;
+ }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const StartIndices m_startIndices;
+ const StopIndices m_stopIndices;
+ const Strides m_strides;
+};
+
+// Eval as rvalue
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+ static const int NumDims = internal::array_size<Strides>::value;
+
+ enum {
+ // Alignment can't be guaranteed at compile time since it depends on the
+ // slice offsets and sizes.
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
+ {
+ // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
+ DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
+ for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+ eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
+ if(m_strides[i]>0){
+ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+ stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+ }else{
+ /* implies m_strides[i]<0 by assert */
+ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+ stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+ }
+ m_startIndices[i] = startIndicesClamped[i];
+ }
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+ // check for degenerate intervals and compute output tensor shape
+ bool degenerate = false;
+ for(int i = 0; i < NumDims; i++){
+ Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
+ if(interval == 0 || ((interval<0) != (m_strides[i]<0))){
+ m_dimensions[i] = 0;
+ degenerate = true;
+ }else{
+ m_dimensions[i] = interval / m_strides[i]
+ + (interval % m_strides[i] != 0 ? 1 : 0);
+ eigen_assert(m_dimensions[i] >= 0);
+ }
+ }
+ Strides output_dims = m_dimensions;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = m_strides[0];
+ m_offsets[0] = startIndicesClamped[0];
+ Index previousDimProduct = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ previousDimProduct *= input_dims[i-1];
+ m_inputStrides[i] = previousDimProduct * m_strides[i];
+ m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+ }
+
+ // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+ // NOTE: if the tensor is degenerate, pass 1 to prevent a TensorIntDivisor constructor crash
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+ }
+ } else {
+ m_inputStrides[NumDims-1] = m_strides[NumDims-1];
+ m_offsets[NumDims-1] = startIndicesClamped[NumDims-1];
+ Index previousDimProduct = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ previousDimProduct *= input_dims[i+1];
+ m_inputStrides[i] = previousDimProduct * m_strides[i];
+ m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+ }
+
+ m_outputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
+ // NOTE: if the tensor is degenerate, pass 1 to prevent a TensorIntDivisor constructor crash
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+ }
+ }
+ m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
+ device.lastLevelCacheSize() /
+ sizeof(Scalar));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(srcCoeff(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+ return NULL;
+ }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i >= 0; --i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+ index -= idx * m_outputStrides[i];
+ }
+ } else {
+ for (int i = 0; i < NumDims; ++i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+ index -= idx * m_outputStrides[i];
+ }
+ }
+ return inputIndex;
+ }
+
+ static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+ return numext::maxi(min, numext::mini(max,value));
+ }
+
+ array<Index, NumDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device& m_device;
+ DSizes<Index, NumDims> m_startIndices; // clamped startIndices
+ DSizes<Index, NumDims> m_dimensions;
+ DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
+ const Strides m_strides;
+ std::size_t m_block_total_size_max;
+};
+
+// Eval as lvalue
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+ : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base;
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+ static const int NumDims = internal::array_size<Strides>::value;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+ RawAccess = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+};
+
+
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
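A usage sketch for the new op, assuming the stridedSlice entry point added to TensorBase in this same changeset (stop indices are exclusive, and a negative stride walks the dimension backwards, with degenerate ranges clamping to an empty result):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<int, 2> t(5, 5);
      for (int i = 0; i < 5; ++i)
        for (int j = 0; j < 5; ++j)
          t(i, j) = 10 * i + j;

      Eigen::array<int, 2> starts  = {4, 0};   // rows 4,3,2 ...
      Eigen::array<int, 2> stops   = {1, 3};   // ... and columns 0,2
      Eigen::array<int, 2> strides = {-1, 2};
      Eigen::Tensor<int, 2> s = t.stridedSlice(starts, stops, strides);
      // s is 3 x 2: {{40,42},{30,32},{20,22}}.
      return (s.dimension(0) == 3 && s.dimension(1) == 2) ? 0 : 1;
    }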
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 88b838b27..647bcf108 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = false,
+ IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
// The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
// to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
// of 1 element first and then pad.
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Compute dimensions
m_dimensions = m_impl.dimensions();
@@ -150,27 +150,26 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
- if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+ if (isPaddingAtIndexForDim(idx, i)) {
return m_paddingValue;
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
- if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) {
+ if (isPaddingAtIndexForDim(index, 0)) {
return m_paddingValue;
}
inputIndex += (index - m_padding[0].first);
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i+1];
- if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+ if (isPaddingAtIndexForDim(idx, i)) {
return m_paddingValue;
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i+1];
}
- if (index < m_padding[NumDims-1].first ||
- index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
+ if (isPaddingAtIndexForDim(index, NumDims-1)) {
return m_paddingValue;
}
inputIndex += (index - m_padding[NumDims-1].first);
@@ -187,43 +186,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
return packetRowMajor(index);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
- {
- Index inputIndex;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- {
- const Index idx = coords[0];
- if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) {
- return m_paddingValue;
- }
- inputIndex = idx - m_padding[0].first;
- }
- for (int i = 1; i < NumDims; ++i) {
- const Index idx = coords[i];
- if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
- return m_paddingValue;
- }
- inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
- }
- } else {
- {
- const Index idx = coords[NumDims-1];
- if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
- return m_paddingValue;
- }
- inputIndex = idx - m_padding[NumDims-1].first;
- }
- for (int i = NumDims - 2; i >= 0; --i) {
- const Index idx = coords[i];
- if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
- return m_paddingValue;
- }
- inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
- }
- }
- return m_impl.coeff(inputIndex);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
TensorOpCost cost = m_impl.costPerCoeff(vectorized);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -239,6 +201,40 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
+ Index index, int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+ index < m_padding[dim_index].first) ||
+ (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+ index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#else
+ return (index < m_padding[dim_index].first) ||
+ (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
+ int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+ EIGEN_UNUSED_VARIABLE(dim_index);
+ return false;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
+ int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+ EIGEN_UNUSED_VARIABLE(dim_index);
+ return false;
+#endif
+ }
+
+
void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
const double in = static_cast<double>(m_impl.dimensions()[i]);
const double out = in + m_padding[i].first + m_padding[i].second;
@@ -261,7 +257,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
@@ -273,15 +269,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
const Index lastPaddedRight = m_outputStrides[i+1];
- if (last < lastPaddedLeft) {
+ if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
// all the coefficients are between the two padding zones.
const Index idx = index / m_outputStrides[i];
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -299,15 +295,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
const Index lastPaddedRight = m_outputStrides[1];
- if (last < lastPaddedLeft) {
+ if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
// all the coefficients are between the two padding zones.
inputIndex += (index - m_padding[0].first);
return m_impl.template packet<Unaligned>(inputIndex);
@@ -318,7 +314,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
@@ -331,15 +327,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
const Index lastPaddedRight = m_outputStrides[i];
- if (last < lastPaddedLeft) {
+ if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
// all the coefficients are between the two padding zones.
const Index idx = index / m_outputStrides[i+1];
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -357,15 +353,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
const Index lastPaddedRight = m_outputStrides[NumDims-1];
- if (last < lastPaddedLeft) {
+ if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= firstPaddedRight && last < lastPaddedRight) {
+ else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
- else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+ else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
// all the coefficients are between the two padding zones.
inputIndex += (index - m_padding[NumDims-1].first);
return m_impl.template packet<Unaligned>(inputIndex);
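What the compile-time checks buy: when the padding amounts are carried in the type via an IndexPairList, isLeftPaddingCompileTimeZero / isRightPaddingCompileTimeZero fold to constants and the corresponding bound tests in the packet paths above disappear entirely. A sketch, assuming the type2indexpair helper and a compiler that enables EIGEN_HAS_INDEX_LIST:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> input(4, 4);
      input.setZero();
      // Dim 0 is statically unpadded; dim 1 gets one element on each side.
      // Both facts are visible to the compiler through the type of paddings.
      Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
                           Eigen::type2indexpair<1, 1> > paddings;
      Eigen::Tensor<float, 2> padded = input.pad(paddings);
      return padded.dimension(1) == 6 ? 0 : 1;
    }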
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index a87e45330..886a254f6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -184,7 +184,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
new file mode 100644
index 000000000..1655a813e
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -0,0 +1,276 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+
+EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
+#ifdef __CUDA_ARCH__
+ // We don't support 3D kernels since we currently only use 1D and
+ // 2D kernels.
+ assert(threadIdx.z == 0);
+ return clock64() +
+ blockIdx.x * blockDim.x + threadIdx.x +
+ gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+
+#elif defined _WIN32
+ // Use the current time as a baseline.
+ SYSTEMTIME st;
+ GetSystemTime(&st);
+ int time = st.wSecond + 1000 * st.wMilliseconds;
+ // Mix in a random number to make sure that we get different seeds if
+ // we try to generate seeds faster than the clock resolution.
+ // We need 2 random values since the generator only generates 16 bits at
+ // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
+ int rnd1 = ::rand();
+ int rnd2 = ::rand();
+ uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
+ return rnd;
+
+#elif defined __APPLE__
+ // Same approach as for win32, except that the random number generator
+ // is better (see https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
+ uint64_t rnd = ::random() ^ mach_absolute_time();
+ return rnd;
+
+#else
+ // Augment the current time with pseudo random number generation
+ // to ensure that we get different seeds if we try to generate seeds
+ // faster than the clock resolution.
+ timespec ts;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ uint64_t rnd = ::random() ^ ts.tv_nsec;
+ return rnd;
+#endif
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+ // TODO: Unify with the implementation in the non-blocking thread pool.
+ uint64_t current = *state;
+ // Update the internal state
+ *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+ // Generate the random output (using the PCG-XSH-RS scheme)
+ return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
+ seed = seed ? seed : get_random_seed();
+ return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+}
+
+} // namespace
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeUniform(uint64_t* state) {
+ unsigned rnd = PCG_XSH_RS_generator(state);
+ return static_cast<T>(rnd);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
+ Eigen::half result;
+ // Generate 10 random bits for the mantissa
+ unsigned rnd = PCG_XSH_RS_generator(state);
+ result.x = static_cast<uint16_t>(rnd & 0x3ffu);
+ // Set the exponent
+ result.x |= (static_cast<uint16_t>(15) << 10);
+ // Return the final result
+ return result - Eigen::half(1.0f);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float RandomToTypeUniform<float>(uint64_t* state) {
+ typedef union {
+ uint32_t raw;
+ float fp;
+ } internal;
+ internal result;
+ // Generate 23 random bits for the mantissa
+ const unsigned rnd = PCG_XSH_RS_generator(state);
+ result.raw = rnd & 0x7fffffu;
+ // Set the exponent
+ result.raw |= (static_cast<uint32_t>(127) << 23);
+ // Return the final result
+ return result.fp - 1.0f;
+}
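The float specialization uses a standard IEEE-754 trick: 23 random mantissa bits under a biased exponent of 127 form a uniform draw from [1, 2), and subtracting 1 maps it to [0, 1) with no divide. The trick in isolation (memcpy replaces the union to sidestep aliasing questions; illustration only):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static float bits_to_unit_float(uint32_t rnd) {
      uint32_t raw = (rnd & 0x7fffffu) | (127u << 23);  // mantissa | exponent
      float fp;
      std::memcpy(&fp, &raw, sizeof(fp));
      return fp - 1.0f;  // [1, 2) -> [0, 1)
    }

    int main() {
      assert(bits_to_unit_float(0u) == 0.0f);
      assert(bits_to_unit_float(0x7fffffu) < 1.0f);
      return 0;
    }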
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double RandomToTypeUniform<double>(uint64_t* state) {
+ typedef union {
+ uint64_t raw;
+ double dp;
+ } internal;
+ internal result;
+ result.raw = 0;
+ // Generate 52 random bits for the mantissa
+ // First generate the upper 20 bits
+ unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+ // Then generate the lower 32 bits
+ unsigned rnd2 = PCG_XSH_RS_generator(state);
+ result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+ // Set the exponent
+ result.raw |= (static_cast<uint64_t>(1023) << 52);
+ // Return the final result
+ return result.dp - 1.0;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
+ return std::complex<float>(RandomToTypeUniform<float>(state),
+ RandomToTypeUniform<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
+ return std::complex<double>(RandomToTypeUniform<double>(state),
+ RandomToTypeUniform<double>(state));
+}
+
+template <typename T> class UniformRandomGenerator {
+ public:
+ static const bool PacketAccess = true;
+
+ // Uses the given "seed" if non-zero, otherwise uses a random seed.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+ uint64_t seed = 0) {
+ m_state = PCG_XSH_RS_state(seed);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+ const UniformRandomGenerator& other) {
+ m_state = other.m_state;
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T operator()(Index i) const {
+ uint64_t local_state = m_state + i;
+ T result = RandomToTypeUniform<T>(&local_state);
+ m_state = local_state;
+ return result;
+ }
+
+ template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(Index i) const {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX T values[packetSize];
+ uint64_t local_state = m_state + i;
+ for (int j = 0; j < packetSize; ++j) {
+ values[j] = RandomToTypeUniform<T>(&local_state);
+ }
+ m_state = local_state;
+ return internal::pload<Packet>(values);
+ }
+
+ private:
+ mutable uint64_t m_state;
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+ enum {
+ // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+ Cost = 12 * NumTraits<Scalar>::AddCost *
+ ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+ PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+ };
+};
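Note the reproducibility contract: each coefficient draws from a state derived as m_state + i, so a fixed non-zero seed yields the same stream on every run. A host-side sketch, assuming the usual TensorBase::random(generator) entry point:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> t(2, 3), u(2, 3);
      Eigen::internal::UniformRandomGenerator<float> gen1(42);
      Eigen::internal::UniformRandomGenerator<float> gen2(42);
      t = t.random(gen1);
      u = u.random(gen2);
      // Same seed, same per-coefficient state, same draws.
      return t(0, 0) == u(0, 0) ? 0 : 1;
    }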
+
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeNormal(uint64_t* state) {
+ // Use the ratio-of-uniforms method to generate numbers following a normal
+ // distribution. See, for example, Numerical Recipes section 7.3.9 for
+ // details.
+ T u, v, q;
+ do {
+ u = RandomToTypeUniform<T>(state);
+ v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+ const T x = u - T(0.449871);
+ const T y = numext::abs(v) + T(0.386595);
+ q = x*x + y * (T(0.196)*y - T(0.25472)*x);
+ } while (q > T(0.27597) &&
+ (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
+
+ return v/u;
+}
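These constants are Leva's ratio-of-uniforms rejection bounds. A standalone sanity check of the same loop, with std::mt19937 standing in for the PCG source; the accepted v/u draws should come out with mean near 0 and variance near 1:

    #include <cmath>
    #include <cstdio>
    #include <random>

    int main() {
      std::mt19937 rng(123);
      std::uniform_real_distribution<double> unif(0.0, 1.0);
      double sum = 0.0, sumsq = 0.0;
      const int N = 1000000;
      for (int i = 0; i < N; ++i) {
        double u, v, q;
        do {  // identical acceptance test to RandomToTypeNormal above
          u = unif(rng);
          v = 1.7156 * (unif(rng) - 0.5);
          const double x = u - 0.449871;
          const double y = std::abs(v) + 0.386595;
          q = x * x + y * (0.196 * y - 0.25472 * x);
        } while (q > 0.27597 &&
                 (q > 0.27846 || v * v > -4.0 * std::log(u) * u * u));
        const double n = v / u;
        sum += n;
        sumsq += n * n;
      }
      const double mean = sum / N;
      std::printf("mean=%g var=%g\n", mean, sumsq / N - mean * mean);
      return 0;
    }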
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
+ return std::complex<float>(RandomToTypeNormal<float>(state),
+ RandomToTypeNormal<float>(state));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
+ return std::complex<double>(RandomToTypeNormal<double>(state),
+ RandomToTypeNormal<double>(state));
+}
+
+
+template <typename T> class NormalRandomGenerator {
+ public:
+ static const bool PacketAccess = true;
+
+ // Uses the given "seed" if non-zero, otherwise uses a random seed.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+ m_state = PCG_XSH_RS_state(seed);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
+ const NormalRandomGenerator& other) {
+ m_state = other.m_state;
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T operator()(Index i) const {
+ uint64_t local_state = m_state + i;
+ T result = RandomToTypeNormal<T>(&local_state);
+ m_state = local_state;
+ return result;
+ }
+
+ template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(Index i) const {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX T values[packetSize];
+ uint64_t local_state = m_state + i;
+ for (int j = 0; j < packetSize; ++j) {
+ values[j] = RandomToTypeNormal<T>(&local_state);
+ }
+ m_state = local_state;
+ return internal::pload<Packet>(values);
+ }
+
+ private:
+ mutable uint64_t m_state;
+};
+
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+ enum {
+ // On average, we need to generate about 3 random numbers:
+ // 15 mul, 8 add, 1.5 logs.
+ Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
+ 15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
+ 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+ PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+ };
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 885295f0a..a87777b22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -87,7 +87,7 @@ struct preserve_inner_most_dims {
static const bool value = false;
};
-#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
@@ -122,7 +122,7 @@ struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
- EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
@@ -183,7 +183,7 @@ struct InnerMostDimPreserver {
template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
- EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
@@ -248,16 +248,12 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
*output = reducer.finalize(reducer.initialize());
return;
}
-#ifdef EIGEN_USE_COST_MODEL
const TensorOpCost cost =
self.m_impl.costPerCoeff(Vectorizable) +
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
PacketSize);
const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
num_coeffs, cost, device.numThreads());
-#else
- const int num_threads = device.numThreads();
-#endif
if (num_threads == 1) {
*output =
InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
@@ -268,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
- Barrier barrier(numblocks);
+ Barrier barrier(internal::convert_index<unsigned int>(numblocks));
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
@@ -320,7 +316,18 @@ struct OuterReducer {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename S, typename R, typename I>
+__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+
+#endif
template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
@@ -396,7 +403,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
{
- EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -464,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- static bool size_large_enough(Index total_size) {
-#ifndef EIGEN_USE_COST_MODEL
- return total_size > 1024 * 1024;
-#else
- return true || total_size;
-#endif
- }
-
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
- if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+ if (RunningFullReduction &&
+ internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
- (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
-
+ !RunningOnGPU)) {
bool need_assign = false;
if (!data) {
m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
@@ -493,7 +492,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
}
// Attempt to use an optimized reduction.
- else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) {
+ else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
bool reducing_inner_dims = true;
for (int i = 0; i < NumReducedDims; ++i) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -506,8 +505,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
(reducing_inner_dims || ReducingInnerMostDims)) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data) {
+ if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+ data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ m_result = data;
+ }
+ else {
+ return true;
+ }
+ }
Op reducer(m_reducer);
- return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+ if (m_result) {
+ m_device.deallocate(m_result);
+ m_result = NULL;
+ }
+ return true;
+ } else {
+ return (m_result != NULL);
+ }
}
bool preserving_inner_dims = true;
@@ -522,8 +538,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
preserving_inner_dims) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data) {
+ if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+ data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+ m_result = data;
+ }
+ else {
+ return true;
+ }
+ }
Op reducer(m_reducer);
- return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+ if (m_result) {
+ m_device.deallocate(m_result);
+ m_result = NULL;
+ }
+ return true;
+ } else {
+ return (m_result != NULL);
+ }
}
}
return true;
@@ -533,13 +566,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
m_impl.cleanup();
if (m_result) {
m_device.deallocate(m_result);
+ m_result = NULL;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
- if (RunningFullReduction && m_result) {
- return *m_result;
+ if ((RunningFullReduction || RunningOnGPU) && m_result) {
+ return *(m_result + index);
}
Op reducer(m_reducer);
if (ReducingInnerMostDims || RunningFullReduction) {
@@ -558,8 +592,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
+
+ if (RunningOnGPU && m_result) {
+ return internal::pload<PacketReturnType>(m_result + index);
+ }
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
if (ReducingInnerMostDims) {
@@ -617,11 +655,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
- template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+#ifdef EIGEN_HAS_CUDA_FP16
+ template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+#endif
template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
+ template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
// Returns the Index in the input tensor of the first value that needs to be
// used to compute the reduction at output index "index".
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
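For orientation, the paths touched above map onto the two user-visible flavors of reduction: a full reduction producing a single coefficient (now cached in m_result and served from coeff()/packet()), and partial reductions over chosen dimensions, which the Inner/OuterReducer specializations accelerate on GPU. A minimal host-side sketch:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> t(3, 4);
      t.setConstant(1.0f);
      Eigen::Tensor<float, 0> total = t.sum();     // full reduction
      Eigen::array<int, 1> dims = {0};
      Eigen::Tensor<float, 1> cols = t.sum(dims);  // inner-dimension reduction
      return (total() == 12.0f && cols(0) == 3.0f) ? 0 : 1;
    }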
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index fd2587dd5..65638b6a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,8 +67,41 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
#endif
}
-template <typename T>
-__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+}
+#endif
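The half2 overload is the standard lock-free read-modify-write loop: combine with the value currently stored, then retry with atomicCAS until no other thread has raced the update. The same pattern spelled out for a float max reduction (illustrative device helper, not part of this patch):

    __device__ inline void atomicMaxFloat(float* output, float value) {
      unsigned int* addr = reinterpret_cast<unsigned int*>(output);
      unsigned int oldval = *addr;
      while (true) {
        const float combined = fmaxf(__uint_as_float(oldval), value);
        const unsigned int newval = __float_as_uint(combined);
        if (newval == oldval) return;      // nothing left to publish
        const unsigned int readback = atomicCAS(addr, oldval, newval);
        if (readback == oldval) return;    // our CAS won the race
        oldval = readback;                 // lost; recombine against the winner
      }
    }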
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
atomicAdd(output, accum);
#else
@@ -86,17 +119,44 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
}
}
+
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
- typename Self::CoeffReturnType* output) {
+ typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if __CUDA_ARCH__ >= 300
+ // Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
- // Initialize the output value if it wasn't initialized by the ReductionInitKernel
- if (gridDim.x == 1 && first_index == 0) {
- *output = reducer.initialize();
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ *output = reducer.initialize();
+ }
+ }
+ else {
+ if (threadIdx.x == 0) {
+ unsigned int block = atomicCAS(semaphore, 0u, 1u);
+ if (block == 0) {
+ // We're the first block to run, so initialize the output value
+ atomicExchCustom(output, reducer.initialize());
+ __threadfence();
+ atomicExch(semaphore, 2u);
+ }
+ else {
+ // Wait for the first block to initialize the output value.
+ // Use atomicCAS here to ensure that the reads aren't cached
+ unsigned int val;
+ do {
+ val = atomicCAS(semaphore, 2u, 2u);
+ }
+ while (val < 2u);
+ }
+ }
}
+ __syncthreads();
+
+ eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
typename Self::CoeffReturnType accum = reducer.initialize();
Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
for (Index i = 0; i < max_iter; i+=BlockSize) {
@@ -108,50 +168,203 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
- reducer.reduce(__shfl_down(accum, offset), &accum);
+ reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(output, accum, reducer);
}
+
+ if (gridDim.x > 1 && threadIdx.x == 0) {
+ // Let the last block reset the semaphore
+ atomicInc(semaphore, gridDim.x + 1);
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
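When several blocks cooperate on one output, the kernel above serializes initialization through a three-state semaphore: 0 = untouched, 1 = one block is initializing, 2 = output ready; the closing atomicInc lets the last block wrap the counter back to 0 so the semaphore can be reused by the next launch. A host-side sketch of the handshake using std::atomic (illustration only, not the device code):

    #include <atomic>

    std::atomic<unsigned> semaphore{0};
    float output;

    // One call per "block"; init is the reducer's identity element.
    void enter_block(float init) {
      unsigned expected = 0;
      if (semaphore.compare_exchange_strong(expected, 1u)) {
        output = init;                                   // role of atomicExchCustom
        semaphore.store(2u, std::memory_order_release);  // role of __threadfence + atomicExch
      } else {
        // Spin until the initializer flips the state to 2, mirroring the
        // atomicCAS(semaphore, 2u, 2u) polling loop in the kernel.
        while (semaphore.load(std::memory_order_acquire) < 2u) {}
      }
      // Safe to atomically fold partial results into output from here on.
    }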
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+ eigen_assert(blockDim.x == 1);
+ eigen_assert(gridDim.x == 1);
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
}
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index num_packets = num_coeffs / 2;
+ for (Index i = thread_id; i < num_packets; i += num_threads) {
+ ((half2*)output)[i] = reducer.template initializePacket<half2>();
+ }
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats.
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+ if (thread_id == 0 && num_coeffs % 2 != 0) {
+ output[num_coeffs-1] = reducer.initialize();
+ }
+}
- template <typename OutputType>
- static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) {
- assert(false && "Should only be called on floats");
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+ half* output, half2* scratch) {
+ eigen_assert(NumPerThread % 2 == 0);
+
+ const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
+
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
+ }
+ }
+ __syncthreads();
+ }
+
+ half2 accum = reducer.template initializePacket<half2>();
+ const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+ for (Index i = 0; i < max_iter; i += BlockSize) {
+ const Index index = first_index + 2*i;
+ eigen_assert(index + 1 < num_coeffs);
+ half2 val = input.m_impl.template packet<Unaligned>(index);
+ reducer.reducePacket(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
}
- static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(scratch, accum, reducer);
+ }
+
+ __syncthreads();
+
+ if (gridDim.x == 1 && first_index == 0) {
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+ }
+}
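FullReductionKernelHalfFloat keeps everything in half2 packets: each lane folds pairs of coefficients into a two-lane accumulator, an odd trailing element is paired with the reducer's identity so the packet path stays uniform, and the two lanes are only combined into a single half at the very end. The same shape on the host with a sum reducer, as a sketch:

    // Pairwise reduction with identity padding for an odd tail (sum reducer,
    // identity 0). acc0/acc1 play the role of the two half2 lanes.
    float reduce_packed(const float* data, int n) {
      float acc0 = 0.0f, acc1 = 0.0f;
      int i = 0;
      for (; i + 1 < n; i += 2) {
        acc0 += data[i];      // low lane
        acc1 += data[i + 1];  // high lane
      }
      if (i < n) acc0 += data[i];  // odd tail: the other lane keeps the identity
      return acc0 + acc1;          // role of __low2half/__high2half + final reduce
    }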
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+ eigen_assert(threadIdx.x == 0);
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+ static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+ assert(false && "Should only be called on doubles, floats and half floats");
+ }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType Scalar;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return;
+ unsigned int* semaphore = NULL;
+ if (num_blocks > 1) {
+ semaphore = device.semaphore();
}
+ LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+ }
+};
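The launcher picks its implementation with a standard SFINAE pattern: the primary template is the unsupported-type fallback, and an enable_if partial specialization carves out the supported scalars. With the constants above, reducing 2^20 floats launches divup(1048576, 256*128) = 32 blocks, so the multi-block semaphore path is exercised. A toy version of the dispatch, assuming nothing beyond the standard library:

    #include <iostream>
    #include <type_traits>

    // Primary template: fallback for unsupported scalar types.
    template <typename T, typename Enabled = void>
    struct Launcher {
      static void run() { std::cout << "generic fallback\n"; }
    };

    // Partial specialization selected only for float and double.
    template <typename T>
    struct Launcher<T, typename std::enable_if<
                           std::is_same<T, float>::value ||
                           std::is_same<T, double>::value>::type> {
      static void run() { std::cout << "optimized GPU path\n"; }
    };

    int main() {
      Launcher<float>::run();  // optimized GPU path
      Launcher<int>::run();    // generic fallback
    }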
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+ static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+ typedef typename Self::Index Index;
+
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ half2* scratch = static_cast<half2*>(device.scratchpad());
if (num_blocks > 1) {
- // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
 // won't be a race condition between multiple thread blocks.
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
- 1, 32, 0, device, reducer.initialize(), 1, output);
+ LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+ 1, 1, 0, device, reducer, self, num_coeffs, scratch);
}
- LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+ LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+ if (num_blocks > 1) {
+ LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+ 1, 1, 0, device, reducer, output, scratch);
+ }
+ }
+};
+#endif
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+ // Unfortunately nvidia doesn't support exotic types such as complex well,
+ // so reduce the scope of the optimized version of the code to the simple cases
+ // of doubles, floats and half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+ template <typename OutputType>
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
@@ -160,6 +373,8 @@ template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
+#if __CUDA_ARCH__ >= 300
+ typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
@@ -179,6 +394,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
+ __syncthreads();
}
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
@@ -188,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
- float reduced_val = reducer.initialize();
+ Type reduced_val = reducer.initialize();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
- for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) {
- const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
reducer.reduce(val, &reduced_val);
}
break;
@@ -217,33 +433,128 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
atomicReduce(&(output[row]), reduced_val, reducer);
}
}
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ half* output) {
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.y == 1);
+ eigen_assert(gridDim.z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+ eigen_assert(unroll_times % 2 == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+ const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ Index i = 2*thread_id;
+ for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
+ half* loc = output + i;
+ *((half2*)loc) = reducer.template initializePacket<half2>();
+ }
+ if (i < num_preserved_coeffs) {
+ output[i] = reducer.initialize();
+ }
__syncthreads();
}
+
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+ const Index row = 2 * (i / input_col_blocks);
+
+ if (row + 1 < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+ half2 reduced_val1 = reducer.template initializePacket<half2>();
+ half2 reduced_val2 = reducer.template initializePacket<half2>();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
+ if (last_col >= num_coeffs_to_reduce) {
+ Index col = col_begin + blockDim.x * j;
+ for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+ const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val1, &reduced_val1);
+ const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ if (col < num_coeffs_to_reduce) {
+ // Peel the last, odd column.
+ const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const half2 val1 = __halves2half2(last1, reducer.initialize());
+ reducer.reducePacket(val1, &reduced_val1);
+ const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
+ const half2 val2 = __halves2half2(last2, reducer.initialize());
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + blockDim.x * (j + k) * 2;
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+ reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+ }
+
+ half val1 = __low2half(reduced_val1);
+ reducer.reduce(__high2half(reduced_val1), &val1);
+ half val2 = __low2half(reduced_val2);
+ reducer.reduce(__high2half(reduced_val2), &val2);
+ half2 val = __halves2half2(val1, val2);
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ half* loc = output + row;
+ atomicReduce((half2*)loc, val, reducer);
+ }
+ }
+ }
}
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats.
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
+#endif
- template <typename Device, typename OutputType>
- static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce floats on a gpu device");
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+ static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
return true;
}
+};
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
- // It's faster to use the usual code.
- if (num_coeffs_to_reduce <= 32) {
- return true;
- }
-
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 128;
@@ -259,7 +570,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
@@ -271,6 +582,85 @@ struct InnerReducer<Self, Op, GpuDevice> {
}
};
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+ static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should not be called since there is no packet accessor");
+ return true;
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ if (num_preserved_vals % 2 != 0) {
+ // Not supported yet, revert to the slower code path
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 128;
+ const int num_per_thread = 64;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be a race condition between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+ num_blocks, 1024, 0, device, reducer, self, num_preserved_vals, output);
+ }
+
+ LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+#endif
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+ // Unfortunately nvidia doesn't support exotic types such as complex well,
+ // so reduce the scope of the optimized version of the code to the simple cases
+ // of floats, doubles and half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+ template <typename OutputType>
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return true;
+ }
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 128) {
+ return true;
+ }
+
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ }
+};
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
@@ -283,6 +673,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
+ __syncthreads();
}
// Do the reduction.
@@ -307,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
// so reduce the scope of the optimized version of the code to the simple case
// of floats and doubles.
static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
-
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
template <typename Device, typename OutputType>
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- assert(false && "Should only be called to reduce floats on a gpu device");
+ assert(false && "Should only be called to reduce doubles or floats on a gpu device");
return true;
}
@@ -323,7 +714,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
return true;
}
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index bc92d9e6d..99245f778 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -193,7 +193,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
return m_evaluator->coeff(index);
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 1a59cc8f7..14e392e36 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
: m_impl(op.expression(), device), m_reverse(op.reverse())
{
// Reversing a scalar isn't supported yet. It would be a no-op anyway.
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Compute strides
m_dimensions = m_impl.dimensions();
@@ -195,7 +195,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// TODO(ndjaitly): write a better packing routine that uses
@@ -269,7 +269,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x) {
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// This code is pilfered from TensorMorphing.h
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
new file mode 100644
index 000000000..8501466ce
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -0,0 +1,287 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Op, typename XprType>
+struct traits<TensorScanOp<Op, XprType> >
+ : public traits<XprType> {
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename Op, typename XprType>
+struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
+{
+ typedef const TensorScanOp<Op, XprType>& type;
+};
+
+template<typename Op, typename XprType>
+struct nested<TensorScanOp<Op, XprType>, 1,
+ typename eval<TensorScanOp<Op, XprType> >::type>
+{
+ typedef TensorScanOp<Op, XprType> type;
+};
+} // end namespace internal
+
+/** \class TensorScan
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor scan class.
+ */
+template <typename Op, typename XprType>
+class TensorScanOp
+ : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
+public:
+ typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
+ const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
+ : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Index axis() const { return m_axis; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const XprType& expression() const { return m_expr; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Op accumulator() const { return m_accumulator; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool exclusive() const { return m_exclusive; }
+
+protected:
+ typename XprType::Nested m_expr;
+ const Index m_axis;
+ const Op m_accumulator;
+ const bool m_exclusive;
+};
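For context, a hypothetical end-to-end use of the scan machinery, assuming a cumsum() convenience wrapper on TensorBase that builds a TensorScanOp with a sum accumulator (the exact entry point is an assumption here):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 2> t(2, 3);
      t.setValues({{1, 2, 3}, {4, 5, 6}});
      // Inclusive scan along axis 1: each row becomes its running sum.
      Eigen::Tensor<float, 2> s = t.cumsum(1);
      std::cout << s << std::endl;  // rows become 1 3 6 and 4 9 15
    }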
+
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher;
+
+// Eval as rvalue
+template <typename Op, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
+
+ typedef TensorScanOp<Op, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+ BlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = true
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+ const Device& device)
+ : m_impl(op.expression(), device),
+ m_device(device),
+ m_exclusive(op.exclusive()),
+ m_accumulator(op.accumulator()),
+ m_size(m_impl.dimensions()[op.axis()]),
+ m_stride(1),
+ m_output(NULL) {
+
+ // Accumulating a scalar isn't supported.
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
+
+ // Compute stride of scan axis
+ const Dimensions& dims = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < op.axis(); ++i) {
+ m_stride = m_stride * dims[i];
+ }
+ } else {
+ for (int i = NumDims - 1; i > op.axis(); --i) {
+ m_stride = m_stride * dims[i];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_impl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
+ return m_stride;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
+ return m_size;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
+ return m_accumulator;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
+ return m_exclusive;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
+ return m_impl;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
+ return m_device;
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ ScanLauncher<Self, Op, Device> launcher;
+ if (data) {
+ launcher(*this, data);
+ return false;
+ }
+
+ const Index total_size = internal::array_prod(dimensions());
+ m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+ launcher(*this, m_output);
+ return true;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
+ {
+ return m_output;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_output[index];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ if (m_output != NULL) {
+ m_device.deallocate(m_output);
+ m_output = NULL;
+ }
+ m_impl.cleanup();
+ }
+
+protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device& m_device;
+ const bool m_exclusive;
+ Op m_accumulator;
+ const Index m_size;
+ Index m_stride;
+ CoeffReturnType* m_output;
+};
+
+// CPU implementation of scan
+// TODO(ibab) This single-threaded implementation should be parallelized,
+// at least by running multiple scans at the same time.
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher {
+ void operator()(Self& self, typename Self::CoeffReturnType *data) {
+ Index total_size = internal::array_prod(self.dimensions());
+
+ // We fix the index along the scan axis to 0 and perform one
+ // scan per remaining entry. The iteration is split into two nested
+ // loops to avoid an integer division by keeping track of each idx1 and idx2.
+ for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+ for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+ // Calculate the starting offset for the scan
+ Index offset = idx1 + idx2;
+
+ // Compute the scan along the axis, starting at the calculated offset
+ typename Self::CoeffReturnType accum = self.accumulator().initialize();
+ for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+ Index curr = offset + idx3 * self.stride();
+
+ if (self.exclusive()) {
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ } else {
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
+ }
+ }
+ }
+ }
+ }
+};
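The only difference between the two branches in the inner loop is whether the accumulator is written out before or after folding in the current coefficient. A 1-D illustration with a sum accumulator:

    #include <cstdio>

    int main() {
      const int in[4] = {1, 2, 3, 4};
      int inc[4], exc[4];
      int a = 0, b = 0;  // sum accumulator, identity 0
      for (int i = 0; i < 4; i++) {
        a += in[i]; inc[i] = a;  // inclusive scan: 1 3 6 10
        exc[i] = b; b += in[i];  // exclusive scan: 0 1 3 6
      }
      for (int i = 0; i < 4; i++) std::printf("%d %d\n", inc[i], exc[i]);
    }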
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+ // Compute offset as in the CPU version
+ Index val = threadIdx.x + blockIdx.x * blockDim.x;
+ Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+ if (offset + (self.size() - 1) * self.stride() < total_size) {
+ // Compute the scan along the axis, starting at the calculated offset
+ typename Self::CoeffReturnType accum = self.accumulator().initialize();
+ for (Index idx = 0; idx < self.size(); idx++) {
+ Index curr = offset + idx * self.stride();
+ if (self.exclusive()) {
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ } else {
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
+ }
+ }
+ }
+ __syncthreads();
+
+}
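Each thread maps its linear id val to the start of one scan line with offset = (val / stride) * stride * size + val % stride: the quotient selects the outer slice (which is size * stride elements wide) and the remainder restores the position inside the stride. With stride = 2 and size = 3, ids 0..3 start at offsets 0, 1, 6, 7, and the four lines tile indices 0..11 exactly once:

    #include <cstdio>

    int main() {
      const int stride = 2, size = 3;
      for (int val = 0; val < 4; val++) {
        int offset = (val / stride) * stride * size + val % stride;
        std::printf("line %d:", val);
        for (int idx = 0; idx < size; idx++)
          std::printf(" %d", offset + idx * stride);  // elements of this scan line
        std::printf("\n");  // prints 0 2 4 / 1 3 5 / 6 8 10 / 7 9 11
      }
    }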
+
+template <typename Self, typename Reducer>
+struct ScanLauncher<Self, Reducer, GpuDevice> {
+ void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+ Index total_size = internal::array_prod(self.dimensions());
+ Index num_blocks = (total_size / self.size() + 63) / 64;
+ Index block_size = 64;
+ LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+ }
+};
+#endif // EIGEN_USE_GPU && __CUDACC__
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index e76533710..113c060e3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -166,7 +166,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
@@ -248,7 +248,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
template <int StoreMode> EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 0e89033c4..f8121d17b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -85,7 +85,7 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
{ EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex>
EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 52b7d216a..6c35bfdb6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -164,7 +164,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index inputIndices[] = {0, 0};
@@ -289,7 +289,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
Index inputIndices[] = {0, 0};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 5950f38e2..3523e7c94 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -20,6 +20,7 @@ struct static_val {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
eigen_assert(v == n);
@@ -53,7 +54,7 @@ struct TensorUInt128
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
explicit TensorUInt128(const T& x) : high(0), low(x) {
- eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest())));
+ eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
eigen_assert(x >= 0);
}
@@ -74,21 +75,21 @@ struct TensorUInt128
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
return (lhs.high == rhs.high) & (lhs.low == rhs.low);
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
return (lhs.high != rhs.high) | (lhs.low != rhs.low);
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (lhs.high != rhs.high) {
return lhs.high > rhs.high;
@@ -98,7 +99,7 @@ static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<H
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (lhs.high != rhs.high) {
return lhs.high < rhs.high;
@@ -108,7 +109,7 @@ static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
if (result.low < rhs.low) {
@@ -119,7 +120,7 @@ static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
if (result.low > lhs.low) {
@@ -130,8 +131,8 @@ static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>
template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
// Split each 128-bit integer into 4 32-bit integers, and then do the
// multiplications by hand as follows:
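One size down, the same schoolbook decomposition computes a full 64x64 -> 128-bit product from four 32-bit partial products; a standalone sketch of the scheme the operator describes (not Eigen code):

    #include <cstdint>
    #include <cstdio>

    static void mul64_by_halves(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
      const uint64_t a0 = a & 0xffffffffu, a1 = a >> 32;
      const uint64_t b0 = b & 0xffffffffu, b1 = b >> 32;
      const uint64_t p00 = a0 * b0, p01 = a0 * b1, p10 = a1 * b0, p11 = a1 * b1;
      // Middle column: carry out of p00 plus the low halves of the cross terms.
      const uint64_t mid = (p00 >> 32) + (p01 & 0xffffffffu) + (p10 & 0xffffffffu);
      *lo = (p00 & 0xffffffffu) | (mid << 32);
      *hi = p11 + (p01 >> 32) + (p10 >> 32) + (mid >> 32);
    }

    int main() {
      uint64_t hi, lo;
      mul64_by_halves(~0ULL, 2, &hi, &lo);  // (2^64 - 1) * 2 = 2^65 - 2
      std::printf("%llx %llx\n", (unsigned long long)hi, (unsigned long long)lo);  // 1 fffffffffffffffe
    }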
@@ -205,8 +206,8 @@ static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>
}
template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index e735fc76f..0ca2cac84 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -187,7 +187,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
- EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
m_paddingValue = op.padding_value();
@@ -408,7 +408,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
deleted file mode 100644
index 6e871a8da..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_TensorSymmetry_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel
- )
-
-add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
deleted file mode 100644
index dc9fc78ec..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_TensorSymmetry_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
deleted file mode 100644
index 88fef50c6..000000000
--- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_ThreadPool_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 6dd64f185..71d55552d 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -50,7 +50,7 @@ class EventCount {
public:
class Waiter;
- EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) {
+ EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
// Initialize epoch to something close to overflow to test overflow.
state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
@@ -169,7 +169,8 @@ class EventCount {
class Waiter {
friend class EventCount;
- std::atomic<Waiter*> next;
+ // Align to a 128-byte boundary to prevent false sharing with other Waiter objects in the same vector.
+ EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
std::mutex mu;
std::condition_variable cv;
uint64_t epoch;
@@ -179,8 +180,6 @@ class EventCount {
kWaiting,
kSignaled,
};
- // Prevent false sharing with other Waiter objects in the same vector.
- char pad_[128];
};
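Switching from a trailing pad_[128] to an alignment attribute isolates each Waiter more directly: every element of the vector starts on its own 128-byte boundary, so the hot atomic in one slot cannot share a cache line with its neighbor. The portable-C++11 version of the idea, as a sketch:

    #include <atomic>

    // Each Slot occupies its own 128-byte-aligned block, so concurrent
    // updates to adjacent slots never contend on a single cache line.
    struct alignas(128) Slot {
      std::atomic<int> counter{0};
    };
    static_assert(alignof(Slot) == 128, "slots are cache-line isolated");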
private:
@@ -200,7 +199,7 @@ class EventCount {
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
static const uint64_t kEpochInc = 1ull << kEpochShift;
std::atomic<uint64_t> state_;
- std::vector<Waiter>& waiters_;
+ MaxSizeVector<Waiter>& waiters_;
void Park(Waiter* w) {
std::unique_lock<std::mutex> lock(w->mu);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 1c471a19f..354bce52a 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -23,18 +23,44 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
: env_(env),
threads_(num_threads),
queues_(num_threads),
+ coprimes_(num_threads),
waiters_(num_threads),
- blocked_(),
- spinning_(),
- done_(),
+ blocked_(0),
+ spinning_(0),
+ done_(false),
ec_(waiters_) {
- for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue());
- for (int i = 0; i < num_threads; i++)
+ waiters_.resize(num_threads);
+
+ // Calculate coprimes of num_threads.
+ // Coprimes are used for a random walk over all threads in Steal
+ // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+ // a walk starting at thread index t and calculate num_threads - 1 subsequent
+ // indices as (t + coprime) % num_threads, we will cover all threads without
+ // repetitions (effectively getting a pseudo-random permutation of thread
+ // indices).
+ for (int i = 1; i <= num_threads; i++) {
+ unsigned a = i;
+ unsigned b = num_threads;
+ // If GCD(a, b) == 1, then a and b are coprimes.
+ while (b != 0) {
+ unsigned tmp = a;
+ a = b;
+ b = tmp % b;
+ }
+ if (a == 1) {
+ coprimes_.push_back(i);
+ }
+ }
+ for (int i = 0; i < num_threads; i++) {
+ queues_.push_back(new Queue());
+ }
+ for (int i = 0; i < num_threads; i++) {
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+ }
}
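The walk works because gcd(inc, num_threads) = 1 implies that t, t + inc, t + 2*inc, ... taken mod num_threads visits every index exactly once before repeating; randomizing both the start and the increment gives each steal attempt a different pseudo-random permutation. A quick standalone check (C++17 for std::gcd):

    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      const unsigned n = 6;
      std::vector<unsigned> coprimes;
      for (unsigned i = 1; i <= n; i++)
        if (std::gcd(i, n) == 1) coprimes.push_back(i);  // {1, 5}
      unsigned victim = 2, inc = coprimes.back();        // start at 2, step by 5
      for (unsigned i = 0; i < n; i++) {
        std::printf("%u ", victim);  // prints 2 1 0 5 4 3: every index once
        victim = (victim + inc) % n;
      }
      std::printf("\n");
    }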
~NonBlockingThreadPoolTempl() {
- done_.store(true, std::memory_order_relaxed);
+ done_ = true;
// Now if all threads block without work, they will start exiting.
// But note that threads can continue to work arbitrary long,
// block, submit new work, unblock and otherwise live full life.
@@ -50,7 +76,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
PerThread* pt = GetPerThread();
if (pt->pool == this) {
// Worker thread of this pool, push onto the thread's queue.
- Queue* q = queues_[pt->index];
+ Queue* q = queues_[pt->thread_id];
t = q->PushFront(std::move(t));
} else {
// A free-standing thread (or worker of another pool), push onto a random
@@ -71,108 +97,111 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
env_.ExecuteTask(t); // Push failed, execute directly.
}
+ int NumThreads() const final {
+ return static_cast<int>(threads_.size());
+ }
+
+ int CurrentThreadId() const final {
+ const PerThread* pt =
+ const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
+ if (pt->pool == this) {
+ return pt->thread_id;
+ } else {
+ return -1;
+ }
+ }
+
private:
typedef typename Environment::EnvThread Thread;
struct PerThread {
- bool inited;
+ constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads.
- unsigned index; // Worker thread index in pool.
- unsigned rand; // Random generator state.
+ uint64_t rand; // Random generator state.
+ int thread_id; // Worker thread index in pool.
};
Environment env_;
MaxSizeVector<Thread*> threads_;
MaxSizeVector<Queue*> queues_;
- std::vector<EventCount::Waiter> waiters_;
+ MaxSizeVector<unsigned> coprimes_;
+ MaxSizeVector<EventCount::Waiter> waiters_;
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
EventCount ec_;
// Main worker thread loop.
- void WorkerLoop(unsigned index) {
+ void WorkerLoop(int thread_id) {
PerThread* pt = GetPerThread();
pt->pool = this;
- pt->index = index;
- Queue* q = queues_[index];
- EventCount::Waiter* waiter = &waiters_[index];
- std::vector<Task> stolen;
+ pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+ pt->thread_id = thread_id;
+ Queue* q = queues_[thread_id];
+ EventCount::Waiter* waiter = &waiters_[thread_id];
for (;;) {
- Task t;
- if (!stolen.empty()) {
- t = std::move(stolen.back());
- stolen.pop_back();
- }
- if (!t.f) t = q->PopFront();
+ Task t = q->PopFront();
if (!t.f) {
- if (Steal(&stolen)) {
- t = std::move(stolen.back());
- stolen.pop_back();
- while (stolen.size()) {
- Task t1 = q->PushFront(std::move(stolen.back()));
- stolen.pop_back();
- if (t1.f) {
- // There is not much we can do in this case. Just execute the
- // remaining directly.
- stolen.push_back(std::move(t1));
- break;
+ t = Steal();
+ if (!t.f) {
+ // Leave one thread spinning. This reduces latency.
+ // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
+ // Also, the time it takes to attempt to steal work 1000 times depends
+ // on the size of the thread pool. However the speed at which the user
+ // of the thread pool submit tasks is independent of the size of the
+ // pool. Consider a time based limit instead.
+ if (!spinning_ && !spinning_.exchange(true)) {
+ for (int i = 0; i < 1000 && !t.f; i++) {
+ t = Steal();
+ }
+ spinning_ = false;
+ }
+ if (!t.f) {
+ if (!WaitForWork(waiter, &t)) {
+ return;
}
}
}
}
if (t.f) {
env_.ExecuteTask(t);
- continue;
}
- // Leave one thread spinning. This reduces latency.
- if (!spinning_ && !spinning_.exchange(true)) {
- bool nowork = true;
- for (int i = 0; i < 1000; i++) {
- if (!OutOfWork()) {
- nowork = false;
- break;
- }
- }
- spinning_ = false;
- if (!nowork) continue;
- }
- if (!WaitForWork(waiter)) return;
}
}
// Steal tries to steal work from other worker threads in best-effort manner.
- bool Steal(std::vector<Task>* stolen) {
- if (queues_.size() == 1) return false;
+ Task Steal() {
PerThread* pt = GetPerThread();
- unsigned lastq = pt->index;
- for (unsigned i = queues_.size(); i > 0; i--) {
- unsigned victim = Rand(&pt->rand) % queues_.size();
- if (victim == lastq && queues_.size() > 2) {
- i++;
- continue;
+ const size_t size = queues_.size();
+ unsigned r = Rand(&pt->rand);
+ unsigned inc = coprimes_[r % coprimes_.size()];
+ unsigned victim = r % size;
+ for (unsigned i = 0; i < size; i++) {
+ Task t = queues_[victim]->PopBack();
+ if (t.f) {
+ return t;
+ }
+ victim += inc;
+ if (victim >= size) {
+ victim -= size;
}
- // Steal half of elements from a victim queue.
- // It is typical to steal just one element, but that assumes that work is
- // recursively subdivided in halves so that the stolen element is exactly
- // half of work. If work elements are equally-sized, then is makes sense
- // to steal half of elements at once and then work locally for a while.
- if (queues_[victim]->PopBackHalf(stolen)) return true;
- lastq = victim;
}
- // Just to make sure that we did not miss anything.
- for (unsigned i = queues_.size(); i > 0; i--)
- if (queues_[i - 1]->PopBackHalf(stolen)) return true;
- return false;
+ return Task();
}
- // WaitForWork blocks until new work is available, or if it is time to exit.
- bool WaitForWork(EventCount::Waiter* waiter) {
- // We already did best-effort emptiness check in Steal, so prepare blocking.
+ // WaitForWork blocks until new work is available (returns true) or until it is
+ // time to exit (returns false). It can optionally return a task to execute in t
+ // (in which case t.f != nullptr on return).
+ bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+ eigen_assert(!t->f);
+ // We already did best-effort emptiness check in Steal, so prepare for
+ // blocking.
ec_.Prewait(waiter);
- // Now do reliable emptiness check.
- if (!OutOfWork()) {
+ // Now do a reliable emptiness check.
+ int victim = NonEmptyQueueIndex();
+ if (victim != -1) {
ec_.CancelWait(waiter);
+ *t = queues_[victim]->PopBack();
return true;
}
// Number of blocked threads is used as termination condition.
@@ -186,7 +215,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
// right after incrementing blocked_ above. Now a free-standing thread
// submits work and calls destructor (which sets done_). If we don't
// re-check queues, we will exit leaving the work unexecuted.
- if (!OutOfWork()) {
+ if (NonEmptyQueueIndex() != -1) {
// Note: we must not pop from queues before we decrement blocked_,
// otherwise the following scenario is possible. Consider that instead
// of checking for emptiness we popped the only element from queues.
@@ -205,23 +234,36 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
return true;
}
- bool OutOfWork() {
- for (unsigned i = 0; i < queues_.size(); i++)
- if (!queues_[i]->Empty()) return false;
- return true;
+ int NonEmptyQueueIndex() {
+ PerThread* pt = GetPerThread();
+ const size_t size = queues_.size();
+ unsigned r = Rand(&pt->rand);
+ unsigned inc = coprimes_[r % coprimes_.size()];
+ unsigned victim = r % size;
+ for (unsigned i = 0; i < size; i++) {
+ if (!queues_[victim]->Empty()) {
+ return victim;
+ }
+ victim += inc;
+ if (victim >= size) {
+ victim -= size;
+ }
+ }
+ return -1;
}
- PerThread* GetPerThread() {
+ static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
EIGEN_THREAD_LOCAL PerThread per_thread_;
PerThread* pt = &per_thread_;
- if (pt->inited) return pt;
- pt->inited = true;
- pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
return pt;
}
- static unsigned Rand(unsigned* state) {
- return *state = *state * 1103515245 + 12345;
+ static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
+ uint64_t current = *state;
+ // Update the internal state
+ *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+ // Generate the random output (using the PCG-XSH-RS scheme)
+ return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}
};
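Aside (not part of the patch): Steal() and NonEmptyQueueIndex() both walk the queues from a random start index, advancing by a stride drawn from coprimes_, so a single pass of size probes visits every queue exactly once; Rand() is a PCG-XSH-RS step over 64-bit state. A minimal standalone sketch of both ideas, assuming C++17 for std::gcd (coprimes_ is precomputed in the real pool; here it is built inline):

    #include <cstdint>
    #include <cstdio>
    #include <numeric>  // std::gcd (C++17)
    #include <vector>

    // Same PCG-XSH-RS step as Rand() in the patch, over a file-local state.
    static uint64_t state = 0x853c49e6748fea9bULL;  // arbitrary nonzero seed
    unsigned Rand() {
      uint64_t current = state;
      state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
      return static_cast<unsigned>((current ^ (current >> 22)) >>
                                   (22 + (current >> 61)));
    }

    int main() {
      const unsigned size = 6;  // stand-in for queues_.size()
      std::vector<unsigned> coprimes;
      for (unsigned i = 1; i <= size; ++i)
        if (std::gcd(i, size) == 1u) coprimes.push_back(i);
      unsigned r = Rand();
      unsigned inc = coprimes[r % coprimes.size()];
      unsigned victim = r % size;
      for (unsigned i = 0; i < size; ++i) {
        std::printf("%u ", victim);  // each index in [0, size) printed once
        victim += inc;
        if (victim >= size) victim -= size;
      }
      std::printf("\n");
    }

Because inc is coprime with size, victim += inc (mod size) is a full-cycle permutation of the queue indices, which is why one pass suffices for the best-effort emptiness check.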
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 0544a6e15..05ed76cbe 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -38,7 +38,7 @@ namespace Eigen {
template <typename Work, unsigned kSize>
class RunQueue {
public:
- RunQueue() : front_(), back_() {
+ RunQueue() : front_(0), back_(0) {
// require power-of-two for fast masking
eigen_assert((kSize & (kSize - 1)) == 0);
eigen_assert(kSize > 2); // why would you do this?
@@ -100,7 +100,7 @@ class RunQueue {
// PopBack removes and returns the last element in the queue.
// Can fail spuriously.
Work PopBack() {
- if (Empty()) return 0;
+ if (Empty()) return Work();
std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
if (!lock) return Work();
unsigned back = back_.load(std::memory_order_relaxed);
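Aside (not part of the patch): PopBack() now signals failure by returning a default-constructed Work rather than 0, which only compiled when Work was a pointer type. With a Task-like struct holding a std::function, callers test t.f, exactly as the stealing loop above does. A small sketch under that assumption:

    #include <cstdio>
    #include <functional>

    // Assumed Task shape: default-constructed means "no work".
    struct Task {
      std::function<void()> f;
    };

    Task PopBackOrEmpty(bool queue_empty) {
      if (queue_empty) return Task();  // t.f is empty: nothing to run
      return Task{[] { std::puts("running stolen task"); }};
    }

    int main() {
      Task t = PopBackOrEmpty(false);
      if (t.f) t.f();  // run only when a real task came back
      t = PopBackOrEmpty(true);
      if (!t.f) std::puts("queue was empty (or the try_lock failed)");
    }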
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
index 17fd1658b..e75d0f467 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -24,7 +24,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
: env_(env), threads_(num_threads), waiters_(num_threads) {
for (int i = 0; i < num_threads; i++) {
- threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
+ threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));
}
}
@@ -55,7 +55,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
// Schedule fn() for execution in the pool of threads. The functions are
// executed in the order in which they are scheduled.
- void Schedule(std::function<void()> fn) {
+ void Schedule(std::function<void()> fn) final {
Task t = env_.CreateTask(std::move(fn));
std::unique_lock<std::mutex> l(mu_);
if (waiters_.empty()) {
@@ -69,9 +69,25 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
}
}
+ int NumThreads() const final {
+ return static_cast<int>(threads_.size());
+ }
+
+ int CurrentThreadId() const final {
+ const PerThread* pt = this->GetPerThread();
+ if (pt->pool == this) {
+ return pt->thread_id;
+ } else {
+ return -1;
+ }
+ }
+
protected:
- void WorkerLoop() {
+ void WorkerLoop(int thread_id) {
std::unique_lock<std::mutex> l(mu_);
+ PerThread* pt = GetPerThread();
+ pt->pool = this;
+ pt->thread_id = thread_id;
Waiter w;
Task t;
while (!exiting_) {
@@ -111,13 +127,24 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
bool ready;
};
+ struct PerThread {
+ constexpr PerThread() : pool(NULL), thread_id(-1) { }
+ SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads.
+ int thread_id; // Worker thread index in pool.
+ };
+
Environment env_;
std::mutex mu_;
MaxSizeVector<Thread*> threads_; // All threads
MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
- std::deque<Task> pending_; // Queue of pending work
- std::condition_variable empty_; // Signaled on pending_.empty()
+ std::deque<Task> pending_; // Queue of pending work
+ std::condition_variable empty_; // Signaled on pending_.empty()
bool exiting_ = false;
+
+ PerThread* GetPerThread() const {
+ EIGEN_THREAD_LOCAL PerThread per_thread;
+ return &per_thread;
+ }
};
typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
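Aside (not part of the patch): CurrentThreadId() relies on a thread-local PerThread record that each worker stamps with its owning pool and index at the top of WorkerLoop(); any thread outside the pool sees the default record and gets -1. A standalone sketch of the pattern, using plain C++11 thread_local in place of EIGEN_THREAD_LOCAL:

    #include <cassert>
    #include <thread>

    class Pool {
     public:
      int CurrentThreadId() const {
        const PerThread* pt = GetPerThread();
        return pt->pool == this ? pt->thread_id : -1;  // -1 for outsiders
      }
      // Stand-in for the stamping done at the top of WorkerLoop().
      void EnterAsWorker(int id) {
        PerThread* pt = GetPerThread();
        pt->pool = this;
        pt->thread_id = id;
      }
     private:
      struct PerThread {
        Pool* pool = nullptr;  // parent pool, or null for normal threads
        int thread_id = -1;    // worker thread index in pool
      };
      static PerThread* GetPerThread() {
        static thread_local PerThread per_thread;
        return &per_thread;
      }
    };

    int main() {
      Pool p;
      assert(p.CurrentThreadId() == -1);  // main thread is not a worker
      std::thread t([&p] {
        p.EnterAsWorker(0);
        assert(p.CurrentThreadId() == 0);  // worker sees its own index
      });
      t.join();
    }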
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index d2204ad5b..399f95cc1 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -21,14 +21,14 @@ struct StlThreadEnvironment {
// destructor must join the thread.
class EnvThread {
public:
- EnvThread(std::function<void()> f) : thr_(f) {}
+ EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
~EnvThread() { thr_.join(); }
private:
std::thread thr_;
};
- EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
+ EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
void ExecuteTask(const Task& t) { t.f(); }
};
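Aside (not part of the patch): the std::move additions matter because copying a std::function deep-copies its captured state. A toy callable that reports its copies illustrates the difference (Probe and its output strings are illustrative only):

    #include <cstdio>
    #include <functional>
    #include <thread>
    #include <utility>

    struct Probe {
      Probe() = default;
      Probe(const Probe&) { std::puts("copy"); }
      Probe(Probe&&) noexcept { std::puts("move"); }
      void operator()() const {}
    };

    int main() {
      std::function<void()> f = Probe{};  // Probe moved into the function
      std::thread a(f);             // copies f, and with it the Probe
      std::thread b(std::move(f));  // moves f; no extra "copy" printed
      a.join();
      b.join();
    }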
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 38b40aceb..a65ee97c9 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -18,6 +18,13 @@ class ThreadPoolInterface {
public:
virtual void Schedule(std::function<void()> fn) = 0;
+ // Returns the number of threads in the pool.
+ virtual int NumThreads() const = 0;
+
+ // Returns a logical thread index between 0 and NumThreads() - 1 if called
+ // from one of the threads in the pool. Returns -1 otherwise.
+ virtual int CurrentThreadId() const = 0;
+
virtual ~ThreadPoolInterface() {}
};
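Aside (not part of the patch): the two new pure virtuals let callers shard per-thread state without locks. A hypothetical use (SumInParallel and the completion-wait left to the caller are illustrative, not Eigen API):

    #include <functional>
    #include <vector>

    class ThreadPoolInterface {  // assumed shape after this patch
     public:
      virtual void Schedule(std::function<void()> fn) = 0;
      virtual int NumThreads() const = 0;
      virtual int CurrentThreadId() const = 0;
      virtual ~ThreadPoolInterface() {}
    };

    // One accumulator slot per worker, indexed by CurrentThreadId(), so
    // workers never contend on the hot path. A real caller must still wait
    // for all scheduled tasks before reading the partial sums.
    void SumInParallel(ThreadPoolInterface* pool, const std::vector<double>& v,
                       std::vector<double>* partial) {
      partial->assign(pool->NumThreads(), 0.0);
      for (double x : v) {
        pool->Schedule([pool, partial, x] {
          int id = pool->CurrentThreadId();  // in [0, NumThreads()) on workers
          (*partial)[id] += x;               // each worker owns its slot
        });
      }
    }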
diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
deleted file mode 100644
index 7eab492d6..000000000
--- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 24159e54c..30d3ebcff 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -117,7 +117,7 @@ template <typename T, size_t n> class array {
values[7] = v8;
}
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
eigen_assert(l.size() == n);
@@ -167,7 +167,7 @@ template <typename T> class array<T, 0> {
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array() : dummy() { }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
eigen_assert(l.size() == 0);
}
diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 961456f10..4bc3dd1ba 100644
--- a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -55,6 +55,17 @@ class MaxSizeVector {
internal::aligned_free(data_);
}
+ void resize(size_t n) {
+ eigen_assert(n <= reserve_);
+ for (size_t i = size_; i < n; ++i) {
+ new (&data_[i]) T;
+ }
+ for (size_t i = n; i < size_; ++i) {
+ data_[i].~T();
+ }
+ size_ = n;
+ }
+
// Append new elements (up to reserved size).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void push_back(const T& t) {
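Aside (not part of the patch): the new resize() pairs placement-new for the growing tail with explicit destructor calls for the shrinking tail, never touching the reserved allocation itself. The same idiom in a self-contained toy container (names are illustrative; the real class uses Eigen's aligned allocator):

    #include <cstddef>
    #include <new>

    template <typename T>
    class FixedCapacityVector {  // toy stand-in for MaxSizeVector
     public:
      explicit FixedCapacityVector(std::size_t reserve)
          : reserve_(reserve), size_(0),
            data_(static_cast<T*>(::operator new(reserve * sizeof(T)))) {}
      ~FixedCapacityVector() {
        resize(0);               // destroy live elements
        ::operator delete(data_);
      }
      void resize(std::size_t n) {
        // n <= reserve_ is the caller's obligation, as in the patch.
        for (std::size_t i = size_; i < n; ++i) new (&data_[i]) T;  // grow
        for (std::size_t i = n; i < size_; ++i) data_[i].~T();      // shrink
        size_ = n;
      }
     private:
      std::size_t reserve_;
      std::size_t size_;
      T* data_;
    };

    int main() {
      FixedCapacityVector<int> v(8);
      v.resize(4);  // default-constructs 4 ints in place
      v.resize(2);  // destroys the last 2
    }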