author    Ville Kallioniemi <ville.kallioniemi@gmail.com>  2016-01-21 23:08:54 -0700
committer Ville Kallioniemi <ville.kallioniemi@gmail.com>  2016-01-21 23:08:54 -0700
commit    9b6c72958a567e78e81b116eba255ba5b1b121ba (patch)
tree      4da7f84648f06827d0bfe32c0e5b53f4a3d085c8 /unsupported
parent    73aec9219b323b21710cb92f7f2ccfbfc253a367 (diff)
parent    7b68cf2e0f3b1b54e099a94606ff951d319d7389 (diff)
Update to latest default branch
Diffstat (limited to 'unsupported')
-rw-r--r-- unsupported/Eigen/CXX11/Tensor | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 6
-rw-r--r-- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 4
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 15
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 4
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 3
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 26
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 372
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 5
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 464
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 54
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 18
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 3
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 6
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 12
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 13
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 11
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 5
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 6
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 68
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 170
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 2
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 12
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1
-rw-r--r-- unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h | 4
-rw-r--r-- unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h | 2
-rw-r--r-- unsupported/Eigen/src/SparseExtra/RandomSetter.h | 6
-rw-r--r-- unsupported/test/cxx11_tensor_broadcasting.cpp | 6
-rw-r--r-- unsupported/test/cxx11_tensor_contraction.cpp | 2
-rw-r--r-- unsupported/test/cxx11_tensor_map.cpp | 2
46 files changed, 853 insertions, 474 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 7481a9ddb..1c5734383 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -88,6 +88,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorReductionCuda.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
+#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
#include "src/Tensor/TensorContractionCuda.h"
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
index 3f149c6a3..c1c57041f 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
@@ -109,11 +109,9 @@ template<int n, typename x> struct get;
template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {};
template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; };
-template<int n EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, as)> struct get<n, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); };
template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
-template<typename T, int n EIGEN_TPL_PP_SPEC_HACK_DEFC(T, as)> struct get<n, numeric_list<T EIGEN_TPL_PP_SPEC_HACK_USEC(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); };
/* always get type, regardless of dummy; good for parameter pack expansion */
@@ -406,7 +404,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
template<typename Op, typename A, typename B, std::size_t N, int... n>
constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
{
- return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
+ return array<decltype(Op::run(A(), B())),N>{ Op::run(array_get<n>(a), array_get<n>(b))... };
}
template<typename Op, typename A, typename B, std::size_t N>
@@ -434,7 +432,7 @@ constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decl
template<typename Op, typename A, std::size_t N, int... n>
constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
{
- return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
+ return array<decltype(Op::run(A())),N>{ Op::run(array_get<n>(a))... };
}
template<typename Op, typename A, std::size_t N>
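Both hunks above replace double-brace initialization of the internal array type with a single brace list; single braces work for std::array via brace elision and match the emulated array's constructors. For reference, a standalone C++14 sketch of the zip pattern itself, written against std::array and std::index_sequence rather than Eigen internals:

    #include <array>
    #include <cstddef>
    #include <utility>

    struct SumOp {
      static constexpr int run(int a, int b) { return a + b; }
    };

    // Elementwise combination of two equally sized arrays, as in h_array_zip.
    template <typename Op, std::size_t N, std::size_t... I>
    constexpr std::array<int, N> zip_impl(const std::array<int, N>& a,
                                          const std::array<int, N>& b,
                                          std::index_sequence<I...>) {
      return std::array<int, N>{{Op::run(a[I], b[I])...}};
    }

    template <typename Op, std::size_t N>
    constexpr std::array<int, N> zip(const std::array<int, N>& a,
                                     const std::array<int, N>& b) {
      return zip_impl<Op>(a, b, std::make_index_sequence<N>());
    }

    static_assert(zip<SumOp>(std::array<int, 3>{{1, 2, 3}},
                             std::array<int, 3>{{10, 20, 30}})[1] == 22,
                  "elementwise zip");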
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index ab9c2ec3e..456b34d0b 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -132,13 +132,13 @@ template <typename T> class array<T, 0> {
return *static_cast<const T*>(NULL);
}
- static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array() { }
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
- array(std::initializer_list<T> l) {
+ EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) {
eigen_assert(l.size() == 0);
}
#endif
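With EIGEN_DEVICE_FUNC on size() and on the initializer-list constructor, the zero-length emulated array becomes fully usable from device code. A minimal CUDA sketch (assuming nvcc and a configuration where the emulated array is in use):

    __global__ void useEmptyArray(int* out) {
      Eigen::array<int, 0> empty;             // device-callable constructor
      *out = static_cast<int>(empty.size());  // device-callable, always 0
    }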
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index ad525bac8..092e30c1f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -78,7 +78,8 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
- CoordAccess = true
+ CoordAccess = true,
+ RawAccess = true
};
static const int Options = Options_;
@@ -118,7 +119,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ return coeff(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
}
#endif
@@ -158,7 +159,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ return coeffRef(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
}
#endif
@@ -198,7 +199,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ return this->operator()(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
}
#else
EIGEN_DEVICE_FUNC
@@ -265,7 +266,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ return operator()(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
}
#else
EIGEN_DEVICE_FUNC
@@ -341,7 +342,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
- : m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}})
+ : m_storage(internal::array_prod(array<Index, NumIndices>{firstDimension, otherDimensions...}), array<Index, NumIndices>{firstDimension, otherDimensions...})
{
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -426,7 +427,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
{
// The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+ resize(array<Index, NumIndices>{firstDimension, otherDimensions...});
}
#endif
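All the Tensor.h hunks make the same mechanical change: the variadic constructor, operator(), coeff()/coeffRef(), and resize() now forward their index packs through a single-brace array initialization. The public API is unchanged:

    Eigen::Tensor<float, 3> t(2, 3, 4);  // constructor forwards {2, 3, 4}
    t(1, 2, 3) = 0.5f;                   // operator() forwards {1, 2, 3}
    t.resize(4, 3, 2);                   // resize forwards {4, 3, 2}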
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index c783aab97..f1ec04c49 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -89,6 +89,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -134,7 +135,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
typedef Index Scalar;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
+ static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
static const int Layout = XprTraits::Layout;
};
@@ -210,6 +211,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
BlockAccess = false,
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
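The trait fix above matters for rank deduction: a tuple reducer (argmax/argmin) removes the reduced dimensions, so its result has XprTraits::NumDimensions - array_size<Dims>::value dimensions. A usage sketch, assuming the usual DenseIndex result type:

    Eigen::Tensor<float, 3> t(2, 3, 7);
    t.setRandom();
    // Reducing over one dimension drops the rank from 3 to 2.
    Eigen::Tensor<Eigen::DenseIndex, 2> amax = t.argmax(2);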
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index a41d4d265..10fac0cc5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess,
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
@@ -152,6 +153,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
return m_leftImpl.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
+
private:
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
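Two related additions here: the assignment evaluator advertises the RawAccess of its left-hand side and, through the new data() accessor, hands out the destination buffer so downstream code can read the result directly instead of re-evaluating. A hedged sketch of a consumer (the helper name is illustrative, not part of Eigen):

    template <typename Evaluator>
    typename Evaluator::CoeffReturnType* rawBufferOrNull(const Evaluator& e) {
      // RawAccess evaluators expose a directly addressable buffer via data().
      return Evaluator::RawAccess ? e.data() : NULL;
    }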
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index dc64959e1..efca7cd79 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -46,6 +46,21 @@ struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorB
typedef TensorBroadcastingOp<Broadcast, XprType> type;
};
+template <typename Dims>
+struct is_input_scalar {
+ static const bool value = false;
+};
+template <>
+struct is_input_scalar<Sizes<> > {
+ static const bool value = true;
+};
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::size_t... Indices>
+struct is_input_scalar<Sizes<Indices...> > {
+ static const bool value = (Sizes<Indices...>::total_size == 1);
+};
+#endif
+
} // end namespace internal
@@ -94,6 +109,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -103,7 +119,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// and store the result in a scalar. Instead one should reshape the scalar into an N-D
// tensor (N >= 1) with a single element first and then broadcast.
EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ const InputDimensions& input_dims = m_impl.dimensions();
const Broadcast& broadcast = op.broadcast();
for (int i = 0; i < NumDims; ++i) {
eigen_assert(input_dims[i] > 0);
@@ -143,6 +159,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
{
+ if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
+ return m_impl.coeff(0);
+ }
+
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return coeffColMajor(index);
} else {
@@ -214,6 +234,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
{
+ if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
+ return internal::pset1<PacketReturnType>(m_impl.coeff(0));
+ }
+
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return packetColMajor<LoadMode>(index);
} else {
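The new is_input_scalar trait short-circuits both coeff() and packet() when the broadcast input has exactly one coefficient: every output element is then m_impl.coeff(0). Note that the constructor comment above still applies; a genuinely rank-0 tensor must be reshaped before broadcasting. A usage sketch:

    // A rank-0 (scalar) tensor cannot be broadcast directly (NumDims > 0 is
    // statically asserted); reshape it to a one-element rank-1 tensor first.
    Eigen::Tensor<float, 0> s;
    s.setConstant(3.14f);
    Eigen::array<Eigen::Index, 1> one = {{1}};
    Eigen::array<Eigen::Index, 1> times = {{5}};
    Eigen::Tensor<float, 1> row = s.reshape(one).broadcast(times);  // 5 copies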
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index abc3c92ca..a209e885b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -145,6 +145,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -304,6 +305,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 3d153bb94..f57d2bb7d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -125,6 +125,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -287,6 +288,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index eda93a1de..624e814e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -21,358 +21,6 @@ namespace Eigen {
*/
namespace internal {
-enum {
- Rhs = 0,
- Lhs = 1,
-};
-
-/*
- * Implementation of the Eigen blas_data_mapper class for tensors.
- */
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous>
-class SimpleTensorContractionMapper {
- public:
- EIGEN_DEVICE_FUNC
- SimpleTensorContractionMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides) :
- m_tensor(tensor),
- m_nocontract_strides(nocontract_strides),
- m_ij_strides(ij_strides),
- m_contract_strides(contract_strides),
- m_k_strides(k_strides) { }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
- // column major assumption
- return operator()(row, 0);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
- return m_tensor.coeff(computeIndex(row, col));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
- const bool left = (side == Lhs);
- Index nocontract_val = left ? row : col;
- Index linidx = 0;
- for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
- const Index idx = nocontract_val / m_ij_strides[i];
- linidx += idx * m_nocontract_strides[i];
- nocontract_val -= idx * m_ij_strides[i];
- }
- if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
- if (side == Lhs && inner_dim_contiguous) {
- eigen_assert(m_nocontract_strides[0] == 1);
- linidx += nocontract_val;
- } else {
- linidx += nocontract_val * m_nocontract_strides[0];
- }
- }
-
- Index contract_val = left ? col : row;
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx = contract_val / m_k_strides[i];
- linidx += idx * m_contract_strides[i];
- contract_val -= idx * m_k_strides[i];
- }
-
- if(array_size<contract_t>::value > 0) {
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx += contract_val;
- } else {
- linidx += contract_val * m_contract_strides[0];
- }
- }
-
- return linidx;
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
- const bool left = (side == Lhs);
- Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
- Index linidx[2] = {0, 0};
- for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = nocontract_val[0] / m_ij_strides[i];
- const Index idx1 = nocontract_val[1] / m_ij_strides[i];
- linidx[0] += idx0 * m_nocontract_strides[i];
- linidx[1] += idx1 * m_nocontract_strides[i];
- nocontract_val[0] -= idx0 * m_ij_strides[i];
- nocontract_val[1] -= idx1 * m_ij_strides[i];
- }
- if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
- if (side == Lhs && inner_dim_contiguous) {
- eigen_assert(m_nocontract_strides[0] == 1);
- linidx[0] += nocontract_val[0];
- linidx[1] += nocontract_val[1];
- } else {
- linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
- linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
- }
- }
-
- Index contract_val[2] = {left ? col : row, left ? col : row + distance};
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = contract_val[0] / m_k_strides[i];
- const Index idx1 = contract_val[1] / m_k_strides[i];
- linidx[0] += idx0 * m_contract_strides[i];
- linidx[1] += idx1 * m_contract_strides[i];
- contract_val[0] -= idx0 * m_k_strides[i];
- contract_val[1] -= idx1 * m_k_strides[i];
- }
-
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx[0] += contract_val[0];
- linidx[1] += contract_val[1];
- } else {
- linidx[0] += contract_val[0] * m_contract_strides[0];
- linidx[1] += contract_val[1] * m_contract_strides[0];
- }
- return IndexPair<Index>(linidx[0], linidx[1]);
- }
-
- Index firstAligned(Index size) const {
- return size;
- }
- Index stride() const {
- return 1;
- }
-
- protected:
- const Tensor m_tensor;
- const nocontract_t m_nocontract_strides;
- const nocontract_t m_ij_strides;
- const contract_t m_contract_strides;
- const contract_t m_k_strides;
-};
-
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment>
- class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous>
-{
- public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> ParentMapper;
-
- EIGEN_DEVICE_FUNC
- BaseTensorContractionMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides) :
- ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
- typedef typename packet_traits<Scalar>::type Packet;
- typedef typename packet_traits<Scalar>::half HalfPacket;
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
- // whole method makes column major assumption
-
- // don't need to add offsets for now (because operator handles that)
- // current code assumes packet size must be a multiple of 2
- EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
- const Index index = this->computeIndex(i, j);
- eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
- return this->m_tensor.template packet<Alignment>(index);
- }
-
- const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
- const Index first = indexPair.first;
- const Index last = indexPair.second;
-
- // We can always do optimized packet reads from left hand side right now, because
- // the vertical matrix dimension on the left hand side is never contracting.
- // On the right hand side we need to check if the contracting dimensions may have
- // been shuffled first.
- if (Tensor::PacketAccess &&
- (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
- (last - first) == (packet_size - 1)) {
-
- return this->m_tensor.template packet<Alignment>(first);
- }
-
- EIGEN_ALIGN_MAX Scalar data[packet_size];
-
- data[0] = this->m_tensor.coeff(first);
- for (Index k = 1; k < packet_size - 1; k += 2) {
- const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
- data[k] = this->m_tensor.coeff(internal_pair.first);
- data[k + 1] = this->m_tensor.coeff(internal_pair.second);
- }
- data[packet_size - 1] = this->m_tensor.coeff(last);
-
- return pload<Packet>(data);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
- // whole method makes column major assumption
-
- // don't need to add offsets for now (because operator handles that)
- const Index half_packet_size = unpacket_traits<HalfPacket>::size;
- if (half_packet_size == packet_size) {
- return loadPacket(i, j);
- }
- EIGEN_ALIGN_MAX Scalar data[half_packet_size];
- for (Index k = 0; k < half_packet_size; k++) {
- data[k] = operator()(i + k, j);
- }
- return pload<HalfPacket>(data);
- }
-};
-
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous>
-{
- public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> ParentMapper;
-
- EIGEN_DEVICE_FUNC
- BaseTensorContractionMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides) :
- ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
- typedef typename packet_traits<Scalar>::type Packet;
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
- EIGEN_ALIGN_MAX Scalar data[1];
- data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
- return pload<typename packet_traits<Scalar>::type>(data);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
- return loadPacket(i, j);
- }
-};
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionInputMapper;
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionSubMapper {
- public:
- typedef typename packet_traits<Scalar>::type Packet;
- typedef typename packet_traits<Scalar>::half HalfPacket;
-
- typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
- typedef Self LinearMapper;
-
- EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
- : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
- return m_base_mapper(i + m_vert_offset, m_horiz_offset);
- }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
- return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
- return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset);
- }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
- return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
- return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
- m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
- return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
- }
-
- template <typename PacketT, int AlignmentType>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
- EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return loadPacket(i);
- }
-
- template <typename Packet>
- EIGEN_DEVICE_FUNC bool aligned(Index) const {
- return false;
- }
-
- private:
- const ParentMapper& m_base_mapper;
- const Index m_vert_offset;
- const Index m_horiz_offset;
-};
-
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionInputMapper
- : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
-
- public:
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
- typedef SubMapper VectorMapper;
-
- EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides)
- : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
- return SubMapper(*this, i, j);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
- return VectorMapper(*this, i, j);
- }
-};
-
-
-
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
{
@@ -480,6 +128,7 @@ struct TensorContractionEvaluatorBase
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = true
};
// Most of the code is assuming that both input tensors are ColMajor. If the
@@ -498,8 +147,6 @@ struct TensorContractionEvaluatorBase
static const int ContractDims = internal::array_size<Indices>::value;
static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
@@ -546,8 +193,8 @@ struct TensorContractionEvaluatorBase
// We need to flip all the pairs of contracting indices as well as
// reversing the dimensions.
for (int i = 0; i < ContractDims; i++) {
- eval_op_indices[i].first = LDims - 1 - op.indices()[i].second;
- eval_op_indices[i].second = RDims - 1 - op.indices()[i].first;
+ eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
+ eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
}
}
@@ -741,17 +388,19 @@ struct TensorContractionEvaluatorBase
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
const Index lhs_packet_size = internal::packet_traits<LhsScalar>::size;
const Index rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+ const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
+ const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
LeftEvaluator, left_nocontract_t,
contract_t, lhs_packet_size,
lhs_inner_dim_contiguous,
- false, Unaligned> LhsMapper;
+ false, lhs_alignment> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
RightEvaluator, right_nocontract_t,
contract_t, rhs_packet_size,
rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
+ rhs_inner_dim_reordered, rhs_alignment> RhsMapper;
LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
m_left_contracting_strides, m_k_strides);
@@ -784,11 +433,11 @@ struct TensorContractionEvaluatorBase
}
template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
return internal::ploadt<Packet, LoadMode>(m_result + index);
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
protected:
// Prevent assignment
@@ -853,9 +502,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
typedef array<Index, ContractDims> contract_t;
typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
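Besides the mapper extraction, several substantive fixes land in this file: contraction index pairs are now flipped in reverse order when mapping RowMajor inputs onto the internal ColMajor convention (so multi-pair contractions pair up the correct dimensions), the input mappers inherit the evaluators' alignment instead of pessimistically assuming Unaligned, and data() now exposes the evaluated result buffer (RawAccess = true). A multi-pair usage sketch of the kind the index-order fix affects:

    Eigen::Tensor<float, 3> a(4, 3, 5);
    Eigen::Tensor<float, 3> b(3, 5, 6);
    a.setRandom();
    b.setRandom();
    // Contract a's dims 1 and 2 against b's dims 0 and 1 simultaneously.
    Eigen::array<Eigen::IndexPair<int>, 2> pairs =
        {{Eigen::IndexPair<int>(1, 0), Eigen::IndexPair<int>(2, 1)}};
    Eigen::Tensor<float, 2> c = a.contract(b, pairs);  // 4 x 6 result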
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 90ee50678..a5f3debc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -1261,7 +1261,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Base(op, device) {}
// We need to redefine this method to make nvcc happy
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
this->m_leftImpl.evalSubExprsIfNeeded(NULL);
this->m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
@@ -1317,6 +1317,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
void evalTyped(Scalar* buffer) const {
// columns in left side, rows in right side
const Index k = this->m_k_size;
+ EIGEN_UNUSED_VARIABLE(k)
// rows in left side
const Index m = this->m_i_size;
@@ -1361,7 +1362,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const dim3 block_size(16, 16, 1);
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
} else {
- const Index m_blocks = (m + 127) / 128;
+ const Index m_blocks = (m + 127) / 128;
const Index n_blocks = (n + 63) / 64;
const dim3 num_blocks(m_blocks, n_blocks, 1);
const dim3 block_size(8, 32, 1);
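The block counts above are ceiling divisions: each kernel tile covers 128 rows and 64 columns, so the grid must round up. A hypothetical helper spelling out the arithmetic (the diff keeps the inline form):

    static inline std::ptrdiff_t ceilDiv(std::ptrdiff_t num, std::ptrdiff_t den) {
      return (num + den - 1) / den;  // smallest block count covering num
    }
    // m_blocks == ceilDiv(m, 128) and n_blocks == ceilDiv(n, 64)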
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
new file mode 100644
index 000000000..9b6d18090
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -0,0 +1,464 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+
+namespace Eigen {
+
+namespace internal {
+
+enum {
+ Rhs = 0,
+ Lhs = 1,
+};
+
+/*
+ * Implementation of the Eigen blas_data_mapper class for tensors.
+ */
+
+template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
+ enum {
+ DirectOffsets = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
+ eigen_assert(false && "unsupported");
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+ {
+ return m_tensor.template packet<LoadMode>(index);
+ }
+
+
+ private:
+ const Tensor m_tensor;
+};
+
+template <typename Tensor> struct CoeffLoader<Tensor, true> {
+ enum {
+ DirectOffsets = true
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+ m_data += offset;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+ {
+ return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
+ }
+ private:
+ typedef typename Tensor::Scalar Scalar;
+ const Scalar* m_data;
+};
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ int packet_size, bool inner_dim_contiguous, int Alignment>
+class SimpleTensorContractionMapper {
+ public:
+ EIGEN_DEVICE_FUNC
+ SimpleTensorContractionMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides) :
+ m_tensor(tensor),
+ m_nocontract_strides(nocontract_strides),
+ m_ij_strides(ij_strides),
+ m_contract_strides(contract_strides),
+ m_k_strides(k_strides) { }
+
+ enum {
+ DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+ m_tensor.offsetBuffer(offset);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
+ // column major assumption
+ return operator()(row, 0);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
+ return m_tensor.coeff(computeIndex(row, col));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
+ const bool left = (side == Lhs);
+ Index nocontract_val = left ? row : col;
+ Index linidx = 0;
+ for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+ const Index idx = nocontract_val / m_ij_strides[i];
+ linidx += idx * m_nocontract_strides[i];
+ nocontract_val -= idx * m_ij_strides[i];
+ }
+ if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ if (side == Lhs && inner_dim_contiguous) {
+ eigen_assert(m_nocontract_strides[0] == 1);
+ linidx += nocontract_val;
+ } else {
+ linidx += nocontract_val * m_nocontract_strides[0];
+ }
+ }
+
+ Index contract_val = left ? col : row;
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx = contract_val / m_k_strides[i];
+ linidx += idx * m_contract_strides[i];
+ contract_val -= idx * m_k_strides[i];
+ }
+
+ if(array_size<contract_t>::value > 0) {
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx += contract_val;
+ } else {
+ linidx += contract_val * m_contract_strides[0];
+ }
+ }
+
+ return linidx;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
+ const bool left = (side == Lhs);
+ Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
+ Index linidx[2] = {0, 0};
+ for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+ const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+ linidx[0] += idx0 * m_nocontract_strides[i];
+ linidx[1] += idx1 * m_nocontract_strides[i];
+ nocontract_val[0] -= idx0 * m_ij_strides[i];
+ nocontract_val[1] -= idx1 * m_ij_strides[i];
+ }
+ if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ if (side == Lhs && inner_dim_contiguous) {
+ eigen_assert(m_nocontract_strides[0] == 1);
+ linidx[0] += nocontract_val[0];
+ linidx[1] += nocontract_val[1];
+ } else {
+ linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
+ linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
+ }
+ }
+
+ Index contract_val[2] = {left ? col : row, left ? col : row + distance};
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = contract_val[0] / m_k_strides[i];
+ const Index idx1 = contract_val[1] / m_k_strides[i];
+ linidx[0] += idx0 * m_contract_strides[i];
+ linidx[1] += idx1 * m_contract_strides[i];
+ contract_val[0] -= idx0 * m_k_strides[i];
+ contract_val[1] -= idx1 * m_k_strides[i];
+ }
+
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx[0] += contract_val[0];
+ linidx[1] += contract_val[1];
+ } else {
+ linidx[0] += contract_val[0] * m_contract_strides[0];
+ linidx[1] += contract_val[1] * m_contract_strides[0];
+ }
+ return IndexPair<Index>(linidx[0], linidx[1]);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
+ // Only claim alignment when we can compute the actual stride (i.e. when we're
+ // dealing with the lhs with inner_dim_contiguous). This is because the
+ // matrix-vector product relies on the stride when dealing with aligned inputs.
+ return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
+ return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
+ }
+
+ protected:
+ CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
+ const nocontract_t m_nocontract_strides;
+ const nocontract_t m_ij_strides;
+ const contract_t m_contract_strides;
+ const contract_t m_k_strides;
+};
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ int packet_size, bool inner_dim_contiguous,
+ bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
+{
+ public:
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
+
+ EIGEN_DEVICE_FUNC
+ BaseTensorContractionMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides) :
+ ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+
+ template <int AlignmentType = Alignment>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+ // whole method makes column major assumption
+
+ // don't need to add offsets for now (because operator handles that)
+ // current code assumes packet size must be a multiple of 2
+ EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
+ const Index index = this->computeIndex(i, j);
+ eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
+ return this->m_tensor.template packet<AlignmentType>(index);
+ }
+
+ const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
+ const Index first = indexPair.first;
+ const Index last = indexPair.second;
+
+ // We can always do optimized packet reads from left hand side right now, because
+ // the vertical matrix dimension on the left hand side is never contracting.
+ // On the right hand side we need to check if the contracting dimensions may have
+ // been shuffled first.
+ if (Tensor::PacketAccess &&
+ (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
+ (last - first) == (packet_size - 1)) {
+
+ return this->m_tensor.template packet<AlignmentType>(first);
+ }
+
+ EIGEN_ALIGN_MAX Scalar data[packet_size];
+
+ data[0] = this->m_tensor.coeff(first);
+ for (Index k = 1; k < packet_size - 1; k += 2) {
+ const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+ data[k] = this->m_tensor.coeff(internal_pair.first);
+ data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+ }
+ data[packet_size - 1] = this->m_tensor.coeff(last);
+
+ return pload<Packet>(data);
+ }
+
+ template <int AlignmentType = Alignment>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+ // whole method makes column major assumption
+
+ // don't need to add offsets for now (because operator handles that)
+ const Index half_packet_size = unpacket_traits<HalfPacket>::size;
+ if (half_packet_size == packet_size) {
+ return loadPacket<AlignmentType>(i, j);
+ }
+ EIGEN_ALIGN_MAX Scalar data[half_packet_size];
+ for (Index k = 0; k < half_packet_size; k++) {
+ data[k] = operator()(i + k, j);
+ }
+ return pload<HalfPacket>(data);
+ }
+};
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ bool inner_dim_contiguous,
+ bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
+{
+ public:
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
+
+ EIGEN_DEVICE_FUNC
+ BaseTensorContractionMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides) :
+ ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+ typedef typename packet_traits<Scalar>::type Packet;
+ template <int> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+ EIGEN_ALIGN_MAX Scalar data[1];
+ data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+ return pload<typename packet_traits<Scalar>::type>(data);
+ }
+ template <int AlignmentType> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
+ return loadPacket<AlignmentType>(i, j);
+ }
+};
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ int packet_size,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionSubMapper {
+ public:
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename packet_traits<Scalar>::half HalfPacket;
+
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
+ typedef Self LinearMapper;
+
+ enum {
+ // We can use direct offsets iff the parent mapper supports them and we can compute the strides.
+ // TODO: we should also enable direct offsets for the Rhs case.
+ UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets
+ };
+
+ EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+ : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
+ // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
+ // this offset every time we attempt to access a coefficient.
+ if (UseDirectOffsets) {
+ Index stride = m_base_mapper.stride();
+ m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper(i, 0);
+ }
+ return m_base_mapper(i + m_vert_offset, m_horiz_offset);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper(i, j);
+ }
+ return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper.template loadPacket<Alignment>(i, 0);
+ }
+ return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper.template loadPacket<Alignment>(i, j);
+ }
+ return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
+ }
+ return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+ if (UseDirectOffsets) {
+ m_base_mapper.storePacket(i, 0, p);
+ return;
+ }
+ m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+ if (UseDirectOffsets) {
+ return LinearMapper(m_base_mapper, i, j);
+ }
+ return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ template <typename PacketT, int AlignmentType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
+ EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
+ if (UseDirectOffsets) {
+ return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
+ }
+ return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
+ return false;
+ }
+
+ private:
+ ParentMapper m_base_mapper;
+ const Index m_vert_offset;
+ const Index m_horiz_offset;
+};
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ int packet_size,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionInputMapper
+ : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
+
+ public:
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
+ typedef SubMapper VectorMapper;
+
+ EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides)
+ : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+ return SubMapper(*this, i, j);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+ return VectorMapper(*this, i, j);
+ }
+};
+
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
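The core new idea in this file is CoeffLoader: a compile-time switch on Tensor::RawAccess between evaluator-mediated coeff() reads and raw-pointer reads, which in turn lets TensorContractionSubMapper bake its offsets into the pointer once (offsetBuffer) instead of re-adding them on every access. A standalone toy of the dispatch (names are illustrative, not Eigen's):

    #include <cassert>
    #include <cstddef>

    template <typename Source, bool HasRawAccess>
    struct Loader {  // generic path: go through the source on every read
      explicit Loader(const Source& s) : src_(s) {}
      void offsetBuffer(std::ptrdiff_t) { assert(false && "unsupported"); }
      float coeff(std::ptrdiff_t i) const { return src_.coeff(i); }
      Source src_;
    };

    template <typename Source>
    struct Loader<Source, true> {  // raw path: advance the pointer once
      explicit Loader(const Source& s) : data_(s.data()) {}
      void offsetBuffer(std::ptrdiff_t off) { data_ += off; }
      float coeff(std::ptrdiff_t i) const { return data_[i]; }
      const float* data_;
    };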
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 3ca7daf32..877bcd0df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -162,6 +162,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index a82bfc0aa..367a152a0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -306,6 +306,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -752,6 +753,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
PacketAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index 0157f6fab..0f8a98caf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
BlockAccess = false,
Layout = TensorEvaluator<XprType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
@@ -250,6 +251,7 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
BlockAccess = false,
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 4d7570077..5abdc489b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -10,7 +10,6 @@
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
-
namespace Eigen {
// This defines an interface that GPUDevice can take to use
@@ -206,20 +205,45 @@ struct GpuDevice {
#endif
}
- inline int getNumCudaMultiProcessors() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().multiProcessorCount;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int maxCudaThreadsPerBlock() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerBlock;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int maxCudaThreadsPerMultiProcessor() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int sharedMemPerBlock() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().sharedMemPerBlock;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
- inline int majorDeviceVersion() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
return stream_->deviceProperties().major;
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+ return 0;
+#endif
}
// This function checks if the CUDA runtime recorded an error for the
@@ -239,23 +263,29 @@ struct GpuDevice {
};
#ifndef __CUDA_ARCH__
-#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
- (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
#else
-#define LAUNCH_CUDA_KERNEL(...) \
- eigen_assert(false && "Cannot launch a kernel from another kernel");
+#define LAUNCH_CUDA_KERNEL(kernel, ...) \
+ { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
+ eigen_assert(false && "Cannot launch a kernel from another kernel");
#endif
+
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
-static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+#ifndef __CUDA_ARCH__
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(config)
+#endif
}
#endif
} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
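All of the accessor changes above apply one pattern, shown here as a minimal standalone sketch (the struct and member names are hypothetical): the host compilation pass reads the cached device properties, while the __CUDA_ARCH__ pass traps and returns a dummy value, since device properties only make sense when configuring a launch from the host.

    struct ToyDevice {
      cudaDeviceProp props;  // assumed to be filled in on the host at setup time
      EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxThreadsPerBlock() const {
    #ifndef __CUDA_ARCH__
        return props.maxThreadsPerBlock;            // host pass: real value
    #else
        eigen_assert(false && "host-only query");   // device pass: trap
        return 0;
    #endif
      }
    };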
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index f3c9a3148..06cac3570 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -110,14 +110,14 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
return internal::arg_prod(Indices...);
}
- Sizes() { }
+ EIGEN_DEVICE_FUNC Sizes() { }
template <typename DenseIndex>
- explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
+ explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
- template <typename... DenseIndex> Sizes(DenseIndex...) { }
- explicit Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
+ template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
+ explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
// todo: add assertion
}
#endif
@@ -289,7 +289,7 @@ struct DSizes : array<DenseIndex, NumDims> {
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) {
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
- (*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}};
+ (*this) = array<DenseIndex, NumDims>{firstDimension, otherDimensions...};
}
#else
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
@@ -405,20 +405,20 @@ template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::si
template <typename Dims1, typename Dims2, size_t n, size_t m>
struct sizes_match_below_dim {
- static inline bool run(Dims1&, Dims2&) {
+ static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
return false;
}
};
template <typename Dims1, typename Dims2, size_t n>
struct sizes_match_below_dim<Dims1, Dims2, n, n> {
- static inline bool run(Dims1& dims1, Dims2& dims2) {
+ static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) {
return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
}
};
template <typename Dims1, typename Dims2>
struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
- static inline bool run(Dims1&, Dims2&) {
+ static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) {
return true;
}
};
@@ -427,7 +427,7 @@ struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
template <typename Dims1, typename Dims2>
-bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
}
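A quick usage sketch of the now device-callable dimensions_match (shapes chosen arbitrarily, header <unsupported/Eigen/CXX11/Tensor> assumed): sizes_match_below_dim peels one dimension per template instantiation, and the primary <n, m> template short-circuits to false whenever the two static ranks differ.

    Eigen::Tensor<float, 3> a(2, 3, 4);
    Eigen::Tensor<float, 3> b(2, 3, 4);
    Eigen::Tensor<float, 2> c(2, 3);
    bool same = Eigen::dimensions_match(a.dimensions(), b.dimensions());  // true
    bool diff = Eigen::dimensions_match(a.dimensions(), c.dimensions());  // false: rank mismatch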
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index ff4373f59..e7daf7304 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
PacketAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = true
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -140,7 +141,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
return internal::ploadt<Packet, LoadMode>(m_buffer + index);
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; }
private:
TensorEvaluator<ArgType, Device> m_impl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 902f25247..f726585b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -43,6 +43,7 @@ struct TensorEvaluator
PacketAccess = Derived::PacketAccess,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
+ RawAccess = true
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
@@ -148,6 +149,7 @@ struct TensorEvaluator<const Derived, Device>
PacketAccess = Derived::PacketAccess,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
+ RawAccess = true
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
@@ -207,6 +209,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC
@@ -257,6 +260,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -312,6 +316,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
internal::functor_traits<BinaryOp>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -378,6 +383,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
internal::packet_traits<Scalar>::HasBlend,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index c28078882..d2ab70f2b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -156,14 +156,14 @@ template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
public:
typedef typename Expression::Index Index;
- EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device);
+ static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
};
template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
public:
typedef typename Expression::Index Index;
- EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device);
+ static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
};
#if defined(__CUDACC__)
@@ -215,7 +215,6 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
template <typename Expression>
EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
-#ifndef __CUDA_ARCH__
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@@ -228,9 +227,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
-#else
- eigen_assert(false && "Cannot launch a kernel from another kernel");
-#endif
}
@@ -238,7 +234,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
template<typename Expression>
EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
-#ifndef __CUDA_ARCH__
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
@@ -251,9 +246,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
-#else
- eigen_assert(false && "Cannot launch a kernel from another kernel");
-#endif
}
#endif // __CUDACC__
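With the #ifndef __CUDA_ARCH__ guard now living inside LAUNCH_CUDA_KERNEL instead of in each run() body, the executors read straight through. For reference, a grid-stride kernel in the spirit of EigenMetaKernel_NonVectorizable (a sketch, assuming the evaluator exposes evalScalar(Index) as the CPU path does):

    template <typename Evaluator, typename Index>
    __global__ void EvalKernelSketch(Evaluator eval, Index size) {
      const Index first = blockIdx.x * blockDim.x + threadIdx.x;
      const Index step = blockDim.x * gridDim.x;  // grid-stride
      for (Index i = first; i < size; i += step) {
        eval.evalScalar(i);  // evaluate and store one coefficient
      }
    }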
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 215a4ebad..3bfaf6d23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -135,6 +135,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index a4d6ce6b3..7d0858d02 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -44,7 +44,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
- };
+ RawAccess = true
+ };
typedef Dimensions_ Dimensions;
static const std::size_t NumIndices = Dimensions::count;
@@ -53,7 +54,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
TensorStorage<Scalar, Dimensions, Options> m_storage;
public:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
@@ -72,7 +73,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ return coeff(array<Index, NumIndices>{firstIndex, otherIndices...});
}
#endif
@@ -104,7 +105,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ return coeffRef(array<Index, NumIndices>{firstIndex, otherIndices...});
}
#endif
@@ -136,7 +137,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ return this->operator()(array<Index, NumIndices>{firstIndex, otherIndices...});
}
#endif
@@ -175,7 +176,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ return operator()(array<Index, NumIndices>{firstIndex, otherIndices...});
}
#endif
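The recurring {{...}} -> {...} edits in this file (and below in TensorMap, the symmetry headers, and the tests) lean on Eigen's array accepting single-brace list initialization, presumably via the EmulateArray.h tweak in this change set; for std::array the two spellings were already equivalent through brace elision. A sketch:

    Eigen::array<int, 3> a{{2, 3, 7}};  // aggregate init of the wrapped C array
    Eigen::array<int, 3> b{2, 3, 7};    // single braces: same contents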
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c16bf7e67..c9b0b2f28 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -92,6 +92,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
IsAligned = true,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = true
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 9316c9831..96f74b992 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 11e510414..2ab332add 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -168,6 +168,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = NumDims == 5,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index ae9e9f751..2798956ae 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -91,6 +91,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index f612bbd45..a37516974 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -123,6 +123,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
+ RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 5c759af09..7233f4c89 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -49,7 +49,8 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
IsAligned = ((int(Options_)&Aligned)==Aligned),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = PlainObjectType::Layout,
- CoordAccess = true
+ CoordAccess = true,
+ RawAccess = true
};
EIGEN_DEVICE_FUNC
@@ -140,10 +141,10 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
{
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
if (PlainObjectType::Options&RowMajor) {
- const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{firstIndex, otherIndices...});
return m_data[index];
} else {
- const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{firstIndex, otherIndices...});
return m_data[index];
}
}
@@ -227,10 +228,10 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor
static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
const std::size_t NumDims = sizeof...(otherIndices) + 1;
if (PlainObjectType::Options&RowMajor) {
- const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
+ const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{firstIndex, otherIndices...});
return m_data[index];
} else {
- const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
+ const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{firstIndex, otherIndices...});
return m_data[index];
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index f28a9699d..d6ad65070 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -24,6 +24,11 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
return second;
}
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const T x, const T y) {
+ return (x + y - 1) / y;
+}
+
template <size_t n> struct max_n_1 {
static const size_t size = n;
};
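divup is plain ceiling division for non-negative integers, replacing the file-local DIVUP macro that the CUDA reduction code below used to define; its main job is sizing kernel grids. For example (the counts are illustrative):

    // divup(10, 3) == (10 + 3 - 1) / 3 == 4
    const int num_coeffs = 1000000;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = Eigen::divup(num_coeffs, block_size * num_per_thread);  // == 31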
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index d8c923d74..11284315c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -110,6 +110,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -145,7 +146,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
return m_impl.template packet<LoadMode>(index);
}
- EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); }
+ EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -317,6 +319,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -545,6 +548,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
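The RawAccess values in this file encode a memory-layout fact: a reshape merely reinterprets the same contiguous buffer, so it can forward its argument's RawAccess, while a slice is generally non-contiguous in the source buffer and must report RawAccess = false. A sketch of the benign case:

    Eigen::Tensor<float, 2> m(4, 6);
    m.setRandom();
    Eigen::array<int, 1> flat{24};
    auto r = m.reshape(flat);  // same 24 floats, same storage; RawAccess carries through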
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 91e32d200..39a305a93 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -93,6 +93,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 8fb53f4f2..2cbb820b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index c30980a49..09ee0c2c6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -24,11 +24,14 @@ template<typename Op, typename Dims, typename XprType>
struct traits<TensorReductionOp<Op, Dims, XprType> >
: traits<XprType>
{
- typedef typename traits<XprType>::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::Scalar Scalar;
typedef typename internal::packet_traits<Scalar>::type Packet;
- typedef typename traits<XprType>::StorageKind StorageKind;
- typedef typename traits<XprType>::Index Index;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
+ static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+ static const int Layout = XprTraits::Layout;
};
template<typename Op, typename Dims, typename XprType>
@@ -337,12 +340,22 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> {
#endif
+// Default inner reducer
+template <typename Self, typename Op, typename Device>
+struct InnerReducer {
+ static const bool HasOptimizedImplementation = false;
+
+ static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ assert(false && "Not implemented");
+ }
+};
+
// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
static const bool HasOptimizedImplementation = false;
- static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
assert(false && "Not implemented");
}
};
@@ -353,6 +366,9 @@ template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
+template <int NPT, typename S, typename R, typename I>
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
@@ -412,6 +428,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
@@ -425,19 +442,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
- // Bitmap indicating if an input dimension is reduced or not.
- array<bool, NumInputDims> reduced;
+ // Build the bitmap indicating if an input dimension is reduced or not.
for (int i = 0; i < NumInputDims; ++i) {
- reduced[i] = false;
+ m_reduced[i] = false;
}
for (int i = 0; i < NumReducedDims; ++i) {
eigen_assert(op.dims()[i] >= 0);
eigen_assert(op.dims()[i] < NumInputDims);
- reduced[op.dims()[i]] = true;
+ m_reduced[op.dims()[i]] = true;
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- internal::DimInitializer<Dimensions>::run(input_dims, reduced, &m_dimensions, &m_reducedDims);
+ internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
// Precompute output strides.
if (NumOutputDims > 0) {
@@ -472,7 +488,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
int outputIndex = 0;
int reduceIndex = 0;
for (int i = 0; i < NumInputDims; ++i) {
- if (reduced[i]) {
+ if (m_reduced[i]) {
m_reducedStrides[reduceIndex] = input_strides[i];
++reduceIndex;
} else {
@@ -493,7 +509,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
@@ -514,26 +530,41 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
}
// Attempt to use an optimized reduction.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) {
+ bool reducing_inner_dims = true;
+ for (int i = 0; i < NumReducedDims; ++i) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ reducing_inner_dims &= m_reduced[i];
+ } else {
+ reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
+ }
+ }
+ if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
+ (reducing_inner_dims || ReducingInnerMostDims)) {
+ const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+ const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ Op reducer(m_reducer);
+ internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ return false;
+ }
+
bool preserving_inner_dims = true;
for (int i = 0; i < NumReducedDims; ++i) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- preserving_inner_dims &= m_reducedDims[NumInputDims - 1 - i];
+ preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
} else {
- preserving_inner_dims &= m_reducedDims[i];
+ preserving_inner_dims &= m_reduced[i];
}
}
- if (internal::OuterReducer<Self, Op, GpuDevice>::HasOptimizedImplementation &&
+ if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
preserving_inner_dims) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
Op reducer(m_reducer);
- internal::OuterReducer<Self, Op, GpuDevice>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
return false;
}
}
-#endif
return true;
}
@@ -615,6 +646,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
@@ -660,6 +692,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
return startInput;
}
+ // Bitmap indicating if an input dimension is reduced or not.
+ array<bool, NumInputDims> m_reduced;
// Dimensions of the output of the operation.
Dimensions m_dimensions;
// Precomputed strides for the output tensor.
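To make the new dispatch concrete, a small example (shapes arbitrary): for a ColMajor rank-2 tensor, reducing dimension 0 consumes contiguous runs and should take the InnerReducer branch, while reducing dimension 1 preserves the contiguous dimension and takes the OuterReducer branch; either way num_values_to_reduce is the product of the reduced extents and num_coeffs_to_preserve the product of the surviving ones.

    Eigen::Tensor<float, 2> t(128, 64);  // ColMajor: dimension 0 is contiguous
    t.setRandom();
    Eigen::array<int, 1> d0{0}, d1{1};
    Eigen::Tensor<float, 1> by_col = t.sum(d0);  // inner: 64 outputs of 128 values each
    Eigen::Tensor<float, 1> by_row = t.sum(d1);  // outer: 128 outputs of 64 values each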
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 8e250867c..2da18b147 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -76,13 +76,24 @@ __device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
#endif
}
+
+template <typename CoeffType, typename Index>
+__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index num_threads = blockDim.x * gridDim.x;
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = val;
+ }
+}
+
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output) {
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
- if (first_index == 0) {
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+ if (gridDim.x == 1 && first_index == 0) {
*output = reducer.initialize();
}
@@ -115,23 +126,138 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
internal::is_same<typename Self::CoeffReturnType, float>::value;
template <typename OutputType>
- EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
assert(false && "Should only be called on floats");
}
- EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+ static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
typedef typename Self::Index Index;
const Index num_coeffs = array_prod(self.m_impl.dimensions());
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
- LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be race conditions between multiple thread blocks.
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+ 1, 32, 0, device, reducer.initialize(), 1, output);
+ }
+
+ LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
}
};
-#define DIVUP(x, y) (((x) + (y)-1) / (y))
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.y == 1);
+ eigen_assert(gridDim.z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+ const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ }
+
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+ const Index row = i / input_col_blocks;
+
+ if (row < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+ float reduced_val = reducer.initialize();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+ if (last_col >= num_coeffs_to_reduce) {
+ for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+ const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ reducer.reduce(val, &reduced_val);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + blockDim.x * (j + k);
+ reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(&(output[row]), reduced_val, reducer);
+ }
+ }
+
+ __syncthreads();
+ }
+}
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+ // Unfortunately nvidia doesn't handle exotic types such as complex well,
+ // so we reduce the scope of the optimized version of the code to the simple
+ // case of floats.
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ internal::is_same<typename Self::CoeffReturnType, float>::value;
+
+ template <typename Device, typename OutputType>
+ static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+ assert(false && "Should only be called to reduce floats on a gpu device");
+ }
+
+ static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+ num_blocks, 1024, 0, device, reducer.initialize(),
+ num_preserved_vals, output);
+ }
+
+ LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+ }
+};
+
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
@@ -139,13 +265,15 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu
typename Self::CoeffReturnType* output) {
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- // Initialize the output values
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
}
// Do the reduction.
- const Index max_iter = DIVUP(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs;
+ const Index max_iter = num_preserved_coeffs * numext::maxi<Index>(1, (num_coeffs_to_reduce - NumPerThread + 1));
for (Index i = thread_id; i < max_iter; i += num_threads) {
const Index input_col = i % num_preserved_coeffs;
const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
@@ -169,28 +297,38 @@ struct OuterReducer<Self, Op, GpuDevice> {
internal::is_same<typename Self::CoeffReturnType, float>::value;
template <typename Device, typename OutputType>
- static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+ static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce floats on a gpu device");
}
- static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
- const int dyn_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread>),
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+ num_blocks, 1024, 0, device, reducer.initialize(),
+ num_preserved_vals, output);
+ }
+
+ LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
}
};
-#undef DIVUP
-
#endif
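The heart of InnerReductionKernel is the warp-level tree step; isolated as a sum-only sketch (assuming a full 32-lane warp and the pre-CUDA-9 __shfl_down intrinsic used above): each iteration halves the number of live partials, so lane 0 ends up holding the warp total, which the kernel then folds into global memory with atomicReduce.

    __device__ float warp_sum_sketch(float v) {
    #pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        v += __shfl_down(v, offset);  // pull the partial held 'offset' lanes away
      }
      return v;  // meaningful in lane 0 only
    }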
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index 6b25b2ba0..57197d060 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -139,6 +139,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
PacketAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
@@ -367,6 +368,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
PacketAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
@@ -412,6 +414,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
enum {
IsAligned = false,
PacketAccess = false,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 10328c61f..846f81e0f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
@@ -239,6 +240,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 15a22aa1b..c4adb7d4c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -225,6 +226,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 97b6168a9..2c2eb6515 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -258,6 +259,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 7a9568b36..2f06f8442 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -20,7 +20,7 @@ class compute_tensor_flags
enum {
is_dynamic_size_storage = 1,
- aligned_bit =
+ is_aligned =
(
((Options&DontAlign)==0) && (
#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
@@ -35,12 +35,12 @@ class compute_tensor_flags
0
#endif
)
- ) ? AlignedBit : 0,
- packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
+ ),
+ packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0
};
public:
- enum { ret = packet_access_bit | aligned_bit};
+ enum { ret = packet_access_bit};
};
@@ -86,7 +86,7 @@ struct traits<TensorMap<PlainObjectType, Options_> >
static const int Layout = BaseTraits::Layout;
enum {
Options = Options_,
- Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+ Flags = BaseTraits::Flags,
};
};
@@ -102,7 +102,7 @@ struct traits<TensorRef<PlainObjectType> >
static const int Layout = BaseTraits::Layout;
enum {
Options = BaseTraits::Options,
- Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+ Flags = BaseTraits::Flags,
};
};
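The upshot of the traits change: alignment no longer travels through the Flags word as an AlignedBit; each evaluator reports it via its IsAligned enum instead (see the TensorMap hunk above, where IsAligned is computed from Options_). A compile-time check sketch:

    typedef Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> AlignedMap;
    static_assert(Eigen::TensorEvaluator<AlignedMap, Eigen::DefaultDevice>::IsAligned,
                  "alignment is now an evaluator property, not a Flags bit");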
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 6625c66d5..52b78b261 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -181,6 +181,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = NumDims == 6,
+ RawAccess = false
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
index bc4f2025f..1b9fe2779 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
@@ -55,7 +55,7 @@ class DynamicSGroup
inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
{
static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
+ return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{firstIndex, otherIndices...});
}
template<typename Tensor_>
@@ -90,7 +90,7 @@ class DynamicSGroup
template<typename Index, std::size_t N, int... n>
inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const
{
- return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }};
+ return std::array<Index, N>{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... };
}
template<typename Index>
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
index 942293bd7..255c344b4 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
@@ -217,7 +217,7 @@ class StaticSGroup
inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
{
static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
+ return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{firstIndex, otherIndices...});
}
template<typename Tensor_>
diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
index eb3e17330..ee97299af 100644
--- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h
+++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
@@ -95,10 +95,10 @@ template<typename Scalar> struct GoogleSparseHashMapTraits
*
 * \brief The RandomSetter is a wrapper object allowing one to set/update a sparse matrix with random access
*
- * \param SparseMatrixType the type of the sparse matrix we are updating
- * \param MapTraits a traits class representing the map implementation used for the temporary sparse storage.
+ * \tparam SparseMatrixType the type of the sparse matrix we are updating
+ * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage.
* Its default value depends on the system.
- * \param OuterPacketBits defines the number of rows (or columns) manage by a single map object
+ * \tparam OuterPacketBits defines the number of rows (or columns) manage by a single map object
* as a power of two exponent.
*
* This class temporarily represents a sparse matrix object using a generic map implementation allowing for
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index 2ddf47234..6fdefd66c 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -167,13 +167,13 @@ static void test_fixed_size_broadcasting()
TensorFixedSize<float, Sizes<1>, DataLayout> t2;
t2 = t2.constant(20.0f);
- Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}});
+ Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{10});
for (int i = 0; i < 10; ++i) {
VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
}
- TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}});
- Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}});
+ TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {1});
+ Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{10});
for (int i = 0; i < 10; ++i) {
VERIFY_IS_APPROX(t5(i), t1(i) + t2(0));
}
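For reference, the behavior this test pins down: broadcast() tiles a tensor by the given per-dimension factors, so a size-1 rank-1 tensor broadcast by {10} acts as ten copies of its single coefficient. A minimal sketch of just that piece:

    Eigen::TensorFixedSize<float, Eigen::Sizes<1>> one;
    one = one.constant(20.0f);
    Eigen::Tensor<float, 1> tiled = one.broadcast(Eigen::array<int, 1>{10});
    // tiled has size 10; every coefficient equals 20.0f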
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index b0d52c6cf..c5f3af73e 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -456,7 +456,7 @@ static void test_tensor_product()
mat1.setRandom();
mat2.setRandom();
- Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{{}});
+ Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{});
VERIFY_IS_EQUAL(result.dimension(0), 2);
VERIFY_IS_EQUAL(result.dimension(1), 3);
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
index a8a095e38..dc0f8a5a2 100644
--- a/unsupported/test/cxx11_tensor_map.cpp
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -130,7 +130,7 @@ static void test_3d()
}
TensorMap<Tensor<const int, 3>> mat3(mat1.data(), 2, 3, 7);
- TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), array<DenseIndex, 3>{{2, 3, 7}});
+ TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), array<DenseIndex, 3>{2, 3, 7});
VERIFY_IS_EQUAL(mat3.rank(), 3);
VERIFY_IS_EQUAL(mat3.size(), 2*3*7);