author     2016-01-21 23:08:54 -0700
committer  2016-01-21 23:08:54 -0700
commit     9b6c72958a567e78e81b116eba255ba5b1b121ba (patch)
tree       4da7f84648f06827d0bfe32c0e5b53f4a3d085c8 /unsupported
parent     73aec9219b323b21710cb92f7f2ccfbfc253a367 (diff)
parent     7b68cf2e0f3b1b54e099a94606ff951d319d7389 (diff)
Update to latest default branch
Diffstat (limited to 'unsupported')
46 files changed, 853 insertions(+), 474 deletions(-)
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 7481a9ddb..1c5734383 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -88,6 +88,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorReductionCuda.h"
 #include "src/Tensor/TensorArgMax.h"
 #include "src/Tensor/TensorConcatenation.h"
+#include "src/Tensor/TensorContractionMapper.h"
 #include "src/Tensor/TensorContraction.h"
 #include "src/Tensor/TensorContractionThreadPool.h"
 #include "src/Tensor/TensorContractionCuda.h"
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
index 3f149c6a3..c1c57041f 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
@@ -109,11 +109,9 @@ template<int n, typename x> struct get;
 template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {};
 template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; };
-template<int n EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, as)> struct get<n, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); };
 
 template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
 template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
-template<typename T, int n EIGEN_TPL_PP_SPEC_HACK_DEFC(T, as)> struct get<n, numeric_list<T EIGEN_TPL_PP_SPEC_HACK_USEC(as)>> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); };
 
 /* always get type, regardless of dummy; good for parameter pack expansion */
@@ -406,7 +404,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
 template<typename Op, typename A, typename B, std::size_t N, int... n>
 constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
 {
-  return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
+  return array<decltype(Op::run(A(), B())),N>{ Op::run(array_get<n>(a), array_get<n>(b))... };
 }
 
 template<typename Op, typename A, typename B, std::size_t N>
@@ -434,7 +432,7 @@ constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decl
 template<typename Op, typename A, std::size_t N, int... n>
 constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
 {
-  return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
+  return array<decltype(Op::run(A())),N>{ Op::run(array_get<n>(a))... };
 }
 
 template<typename Op, typename A, std::size_t N>
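The CXX11Meta.h hunks above (and many later hunks in this commit) replace double-braced initializers with single braces. A minimal standalone sketch, not Eigen's actual definitions, of why the extra braces become unnecessary once the array type gains a variadic constructor:

    #include <array>
    #include <cstddef>

    template <typename T, std::size_t N>
    struct my_array {                // stand-in for Eigen's array emulation
      T values[N];                   // raw aggregate storage
      template <typename... Ts>      // variadic constructor accepts a flat brace list
      constexpr my_array(Ts... vals) : values{vals...} {}
    };

    int main() {
      std::array<int, 3> a = {{1, 2, 3}};  // aggregate of an aggregate: nested braces
      my_array<int, 3> b{1, 2, 3};         // constructor call: single braces suffice
      return a[0] + b.values[0];
    }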
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index ab9c2ec3e..456b34d0b 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -132,13 +132,13 @@ template <typename T> class array<T, 0> {
     return *static_cast<const T*>(NULL);
   }
 
-  static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
 
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE array() { }
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-  array(std::initializer_list<T> l) {
+  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) {
     eigen_assert(l.size() == 0);
   }
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index ad525bac8..092e30c1f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -78,7 +78,8 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     Layout = Options_ & RowMajor ? RowMajor : ColMajor,
-    CoordAccess = true
+    CoordAccess = true,
+    RawAccess = true
   };
 
   static const int Options = Options_;
@@ -118,7 +119,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+      return coeff(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
     }
 #endif
@@ -158,7 +159,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+      return coeffRef(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
     }
 #endif
@@ -198,7 +199,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+      return this->operator()(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
     }
 #else
     EIGEN_DEVICE_FUNC
@@ -265,7 +266,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+      return operator()(array<Index, NumIndices>{firstIndex, secondIndex, otherIndices...});
     }
 #else
     EIGEN_DEVICE_FUNC
@@ -341,7 +342,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
-        : m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}})
+        : m_storage(internal::array_prod(array<Index, NumIndices>{firstDimension, otherDimensions...}), array<Index, NumIndices>{firstDimension, otherDimensions...})
     {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -426,7 +427,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
     {
       // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+      resize(array<Index, NumIndices>{firstDimension, otherDimensions...});
     }
 #endif
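A pattern repeated throughout this commit is the new RawAccess entry in each evaluator's flag enum: it advertises whether the evaluator can hand out a pointer to a materialized, contiguous buffer through data(). A simplified sketch of the idea, with hypothetical types rather than Eigen's real classes:

    #include <cstddef>

    struct BufferEvaluator {
      enum { RawAccess = true };
      const float* m_data;
      float coeff(std::ptrdiff_t i) const { return m_data[i]; }
      const float* data() const { return m_data; }   // raw buffer available
    };

    struct ExpressionEvaluator {
      enum { RawAccess = false };
      float coeff(std::ptrdiff_t i) const { return static_cast<float>(i) * 2.0f; }
      const float* data() const { return nullptr; }  // no materialized buffer
    };

    template <typename Evaluator>
    float read(const Evaluator& ev, std::ptrdiff_t i) {
      // RawAccess is a compile-time constant, so the dead branch is optimized away.
      return Evaluator::RawAccess ? ev.data()[i] : ev.coeff(i);
    }

The contraction mapper introduced later in this commit keys its fast path on exactly this kind of flag.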
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index c783aab97..f1ec04c49 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -89,6 +89,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -134,7 +135,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
   typedef Index Scalar;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
-  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
   static const int Layout = XprTraits::Layout;
 };
 
@@ -210,6 +211,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     BlockAccess = false,
     Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
     CoordAccess = false, // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index a41d4d265..10fac0cc5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess,
   };
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
@@ -152,6 +153,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     return m_leftImpl.template packet<LoadMode>(index);
   }
 
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
+
  private:
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
   TensorEvaluator<RightArgType, Device> m_rightImpl;
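The one-line traits change in TensorArgMax.h above fixes the rank of the tuple-reducer result: reducing over the array_size<Dims>::value axes of a rank-N input must yield a rank N - |Dims| output, not a rank-N one. The arithmetic as a tiny self-contained sketch (illustrative names only):

    #include <cstddef>

    // reducing over NumReducedDims axes of a rank-InputRank tensor
    template <int InputRank, std::size_t NumReducedDims>
    struct reduced_rank {
      static const int value = InputRank - static_cast<int>(NumReducedDims);
    };

    // e.g. argmax along one axis of a rank-3 tensor yields a rank-2 tensor of indices
    static_assert(reduced_rank<3, 1>::value == 2, "rank drops by the number of reduced dims");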
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index dc64959e1..efca7cd79 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -46,6 +46,21 @@ struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorB
   typedef TensorBroadcastingOp<Broadcast, XprType> type;
 };
 
+template <typename Dims>
+struct is_input_scalar {
+  static const bool value = false;
+};
+template <>
+struct is_input_scalar<Sizes<> > {
+  static const bool value = true;
+};
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::size_t... Indices>
+struct is_input_scalar<Sizes<Indices...> > {
+  static const bool value = (Sizes<Indices...>::total_size == 1);
+};
+#endif
+
 }  // end namespace internal
 
@@ -94,6 +109,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -103,7 +119,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
     // tensor with N >= 1 of 1 element first and then broadcast.
     EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    const InputDimensions& input_dims = m_impl.dimensions();
     const Broadcast& broadcast = op.broadcast();
     for (int i = 0; i < NumDims; ++i) {
       eigen_assert(input_dims[i] > 0);
@@ -143,6 +159,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
   {
+    if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
+      return m_impl.coeff(0);
+    }
+
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return coeffColMajor(index);
     } else {
@@ -214,6 +234,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
   {
+    if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
+      return internal::pset1<PacketReturnType>(m_impl.coeff(0));
+    }
+
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return packetColMajor<LoadMode>(index);
     } else {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index abc3c92ca..a209e885b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -145,6 +145,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -304,6 +305,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
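The new internal::is_input_scalar trait above lets broadcasting short-circuit to m_impl.coeff(0) whenever the input holds exactly one coefficient. A simplified, self-contained version of the same compile-time test, using a hypothetical dim_list in place of Eigen's Sizes<>:

    #include <cstddef>

    template <std::size_t... Dims>
    struct dim_list {};

    template <typename Dims>
    struct is_single_element {
      static const bool value = false;  // unknown/dynamic dims: assume not scalar
    };

    template <>                          // empty list = rank 0 = one element
    struct is_single_element<dim_list<>> {
      static const bool value = true;
    };

    template <std::size_t D, std::size_t... Rest>  // all static extents equal to 1
    struct is_single_element<dim_list<D, Rest...>> {
      static const bool value = (D == 1) && is_single_element<dim_list<Rest...>>::value;
    };

    static_assert(is_single_element<dim_list<>>::value, "rank-0 input is scalar");
    static_assert(is_single_element<dim_list<1, 1>>::value, "1x1 input is scalar");
    static_assert(!is_single_element<dim_list<2, 3>>::value, "2x3 input is not");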
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 3d153bb94..f57d2bb7d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -125,6 +125,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
     IsAligned = false,
     PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -287,6 +288,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
     IsAligned = false,
     PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index eda93a1de..624e814e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -21,358 +21,6 @@ namespace Eigen {
  */
 namespace internal {
 
-enum {
-  Rhs = 0,
-  Lhs = 1,
-};
-
-/*
- * Implementation of the Eigen blas_data_mapper class for tensors.
- */
-template<typename Scalar, typename Index, int side,
-         typename Tensor,
-         typename nocontract_t, typename contract_t,
-         int packet_size, bool inner_dim_contiguous>
-class SimpleTensorContractionMapper {
-  public:
-  EIGEN_DEVICE_FUNC
-  SimpleTensorContractionMapper(const Tensor& tensor,
-                                const nocontract_t& nocontract_strides,
-                                const nocontract_t& ij_strides,
-                                const contract_t& contract_strides,
-                                const contract_t& k_strides) :
-      m_tensor(tensor),
-      m_nocontract_strides(nocontract_strides),
-      m_ij_strides(ij_strides),
-      m_contract_strides(contract_strides),
-      m_k_strides(k_strides) { }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
-    // column major assumption
-    return operator()(row, 0);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
-    return m_tensor.coeff(computeIndex(row, col));
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
-    const bool left = (side == Lhs);
-    Index nocontract_val = left ? row : col;
-    Index linidx = 0;
-    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
-      const Index idx = nocontract_val / m_ij_strides[i];
-      linidx += idx * m_nocontract_strides[i];
-      nocontract_val -= idx * m_ij_strides[i];
-    }
-    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
-      if (side == Lhs && inner_dim_contiguous) {
-        eigen_assert(m_nocontract_strides[0] == 1);
-        linidx += nocontract_val;
-      } else {
-        linidx += nocontract_val * m_nocontract_strides[0];
-      }
-    }
-
-    Index contract_val = left ? col : row;
-    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
-      const Index idx = contract_val / m_k_strides[i];
-      linidx += idx * m_contract_strides[i];
-      contract_val -= idx * m_k_strides[i];
-    }
-
-    if(array_size<contract_t>::value > 0) {
-      if (side == Rhs && inner_dim_contiguous) {
-        eigen_assert(m_contract_strides[0] == 1);
-        linidx += contract_val;
-      } else {
-        linidx += contract_val * m_contract_strides[0];
-      }
-    }
-
-    return linidx;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
-    const bool left = (side == Lhs);
-    Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
-    Index linidx[2] = {0, 0};
-    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
-      const Index idx0 = nocontract_val[0] / m_ij_strides[i];
-      const Index idx1 = nocontract_val[1] / m_ij_strides[i];
-      linidx[0] += idx0 * m_nocontract_strides[i];
-      linidx[1] += idx1 * m_nocontract_strides[i];
-      nocontract_val[0] -= idx0 * m_ij_strides[i];
-      nocontract_val[1] -= idx1 * m_ij_strides[i];
-    }
-    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
-      if (side == Lhs && inner_dim_contiguous) {
-        eigen_assert(m_nocontract_strides[0] == 1);
-        linidx[0] += nocontract_val[0];
-        linidx[1] += nocontract_val[1];
-      } else {
-        linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
-        linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
-      }
-    }
-
-    Index contract_val[2] = {left ? col : row, left ? col : row + distance};
-    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
-      const Index idx0 = contract_val[0] / m_k_strides[i];
-      const Index idx1 = contract_val[1] / m_k_strides[i];
-      linidx[0] += idx0 * m_contract_strides[i];
-      linidx[1] += idx1 * m_contract_strides[i];
-      contract_val[0] -= idx0 * m_k_strides[i];
-      contract_val[1] -= idx1 * m_k_strides[i];
-    }
-
-    if (side == Rhs && inner_dim_contiguous) {
-      eigen_assert(m_contract_strides[0] == 1);
-      linidx[0] += contract_val[0];
-      linidx[1] += contract_val[1];
-    } else {
-      linidx[0] += contract_val[0] * m_contract_strides[0];
-      linidx[1] += contract_val[1] * m_contract_strides[0];
-    }
-    return IndexPair<Index>(linidx[0], linidx[1]);
-  }
-
-  Index firstAligned(Index size) const {
-    return size;
-  }
-  Index stride() const {
-    return 1;
-  }
-
- protected:
-  const Tensor m_tensor;
-  const nocontract_t m_nocontract_strides;
-  const nocontract_t m_ij_strides;
-  const contract_t m_contract_strides;
-  const contract_t m_k_strides;
-};
-
-
-template<typename Scalar, typename Index, int side,
-         typename Tensor,
-         typename nocontract_t, typename contract_t,
-         int packet_size, bool inner_dim_contiguous,
-         bool inner_dim_reordered, int Alignment>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous>
-{
- public:
-  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> ParentMapper;
-
-  EIGEN_DEVICE_FUNC
-  BaseTensorContractionMapper(const Tensor& tensor,
-                              const nocontract_t& nocontract_strides,
-                              const nocontract_t& ij_strides,
-                              const contract_t& contract_strides,
-                              const contract_t& k_strides) :
-  ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
-    // whole method makes column major assumption
-
-    // don't need to add offsets for now (because operator handles that)
-    // current code assumes packet size must be a multiple of 2
-    EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
-      const Index index = this->computeIndex(i, j);
-      eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
-      return this->m_tensor.template packet<Alignment>(index);
-    }
-
-    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
-    const Index first = indexPair.first;
-    const Index last = indexPair.second;
-
-    // We can always do optimized packet reads from left hand side right now, because
-    // the vertical matrix dimension on the left hand side is never contracting.
-    // On the right hand side we need to check if the contracting dimensions may have
-    // been shuffled first.
-    if (Tensor::PacketAccess &&
-        (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
-        (last - first) == (packet_size - 1)) {
-
-      return this->m_tensor.template packet<Alignment>(first);
-    }
-
-    EIGEN_ALIGN_MAX Scalar data[packet_size];
-
-    data[0] = this->m_tensor.coeff(first);
-    for (Index k = 1; k < packet_size - 1; k += 2) {
-      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
-      data[k] = this->m_tensor.coeff(internal_pair.first);
-      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
-    }
-    data[packet_size - 1] = this->m_tensor.coeff(last);
-
-    return pload<Packet>(data);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
-    // whole method makes column major assumption
-
-    // don't need to add offsets for now (because operator handles that)
-    const Index half_packet_size = unpacket_traits<HalfPacket>::size;
-    if (half_packet_size == packet_size) {
-      return loadPacket(i, j);
-    }
-    EIGEN_ALIGN_MAX Scalar data[half_packet_size];
-    for (Index k = 0; k < half_packet_size; k++) {
-      data[k] = operator()(i + k, j);
-    }
-    return pload<HalfPacket>(data);
-  }
-};
-
-
-template<typename Scalar, typename Index, int side,
-         typename Tensor,
-         typename nocontract_t, typename contract_t,
-         bool inner_dim_contiguous,
-         bool inner_dim_reordered, int Alignment>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous>
-{
- public:
-  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> ParentMapper;
-
-  EIGEN_DEVICE_FUNC
-  BaseTensorContractionMapper(const Tensor& tensor,
-                              const nocontract_t& nocontract_strides,
-                              const nocontract_t& ij_strides,
-                              const contract_t& contract_strides,
-                              const contract_t& k_strides) :
-  ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
-    EIGEN_ALIGN_MAX Scalar data[1];
-    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
-    return pload<typename packet_traits<Scalar>::type>(data);
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
-    return loadPacket(i, j);
-  }
-};
-
-template<typename Scalar, typename Index, int side,
-         typename Tensor,
-         typename nocontract_t, typename contract_t,
-         int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionInputMapper;
-
-template<typename Scalar, typename Index, int side,
-         typename Tensor,
-         typename nocontract_t, typename contract_t,
-         int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionSubMapper {
- public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
-  typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
-  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
-  typedef Self LinearMapper;
-
-  EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
-    return m_base_mapper(i + m_vert_offset, m_horiz_offset);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
-    return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
-    return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
-    return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
-    return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
-    m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
-    return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
-  }
-
-  template <typename PacketT, int AlignmentType>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
-    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return loadPacket(i);
-  }
-
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC bool aligned(Index) const {
-    return false;
-  }
-
- private:
-  const ParentMapper& m_base_mapper;
-  const Index m_vert_offset;
-  const Index m_horiz_offset;
-};
-
-
-template<typename Scalar, typename Index, int side,
-         typename Tensor,
-         typename nocontract_t, typename contract_t,
-         int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionInputMapper
-  : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
-
- public:
-  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
-  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
-  typedef SubMapper VectorMapper;
-
-  EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
-                                                 const nocontract_t& nocontract_strides,
-                                                 const nocontract_t& ij_strides,
-                                                 const contract_t& contract_strides,
-                                                 const contract_t& k_strides)
-      : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
-    return SubMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
-    return VectorMapper(*this, i, j);
-  }
-};
-
-
-
 template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
@@ -480,6 +128,7 @@ struct TensorContractionEvaluatorBase
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     Layout = TensorEvaluator<LeftArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
+    RawAccess = true
   };
 
   // Most of the code is assuming that both input tensors are ColMajor. If the
@@ -498,8 +147,6 @@ struct TensorContractionEvaluatorBase
   static const int ContractDims = internal::array_size<Indices>::value;
   static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
 
-  typedef array<Index, LDims> left_dim_mapper_t;
-  typedef array<Index, RDims> right_dim_mapper_t;
   typedef array<Index, ContractDims> contract_t;
   typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
   typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
@@ -546,8 +193,8 @@ struct TensorContractionEvaluatorBase
       // We need to flip all the pairs of contracting indices as well as
       // reversing the dimensions.
       for (int i = 0; i < ContractDims; i++) {
-        eval_op_indices[i].first = LDims - 1 - op.indices()[i].second;
-        eval_op_indices[i].second = RDims - 1 - op.indices()[i].first;
+        eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
+        eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
       }
     }
@@ -741,17 +388,19 @@ struct TensorContractionEvaluatorBase
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
     const Index lhs_packet_size = internal::packet_traits<LhsScalar>::size;
     const Index rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+    const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
+    const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
     typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
                                                    LeftEvaluator, left_nocontract_t,
                                                    contract_t, lhs_packet_size,
                                                    lhs_inner_dim_contiguous,
-                                                   false, Unaligned> LhsMapper;
+                                                   false, lhs_alignment> LhsMapper;
 
     typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
                                                    RightEvaluator, right_nocontract_t,
                                                    contract_t, rhs_packet_size,
                                                    rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+                                                   rhs_inner_dim_reordered, rhs_alignment> RhsMapper;
 
     LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
                   m_left_contracting_strides, m_k_strides);
@@ -784,11 +433,11 @@ struct TensorContractionEvaluatorBase
   }
 
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
    return internal::ploadt<Packet, LoadMode>(m_result + index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
 
  protected:
   // Prevent assignment
@@ -853,9 +502,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
   static const int ContractDims = internal::array_size<Indices>::value;
 
-  typedef array<Index, LDims> left_dim_mapper_t;
-  typedef array<Index, RDims> right_dim_mapper_t;
-
   typedef array<Index, ContractDims> contract_t;
   typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
   typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
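The two-line fix to eval_op_indices above changes how a RowMajor contraction is rewritten as a ColMajor one: every dimension index d becomes (rank - 1 - d), and the list of contracting pairs is now walked in reverse so the flipped pairs come out in the relative order the rest of the evaluator expects. A standalone worked example of the bookkeeping (all names illustrative, not Eigen's):

    #include <cstdio>
    #include <utility>

    int main() {
      const int LDims = 3, RDims = 3, ContractDims = 2;
      // contract lhs dims {0,1} with rhs dims {1,2}, expressed as (lhs, rhs) pairs
      std::pair<int, int> op_indices[2] = {{0, 1}, {1, 2}};
      std::pair<int, int> eval_op_indices[2];

      for (int i = 0; i < ContractDims; i++) {
        // reversed traversal: pair i of the ColMajor view comes from pair
        // (ContractDims - 1 - i) of the original RowMajor expression
        eval_op_indices[i].first  = LDims - 1 - op_indices[ContractDims - 1 - i].second;
        eval_op_indices[i].second = RDims - 1 - op_indices[ContractDims - 1 - i].first;
      }
      for (int i = 0; i < ContractDims; i++)
        std::printf("pair %d: (%d, %d)\n", i, eval_op_indices[i].first, eval_op_indices[i].second);
      // prints (0, 1) then (1, 2): mirrored dims, still in increasing order;
      // without the reversed traversal the pairs would come out decreasing.
    }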
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 90ee50678..a5f3debc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -1261,7 +1261,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       Base(op, device) {}
 
   // We need to redefine this method to make nvcc happy
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
     this->m_leftImpl.evalSubExprsIfNeeded(NULL);
     this->m_rightImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
@@ -1317,6 +1317,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   void evalTyped(Scalar* buffer) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
+    EIGEN_UNUSED_VARIABLE(k)
 
     // rows in left side
     const Index m = this->m_i_size;
@@ -1361,7 +1362,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       const dim3 block_size(16, 16, 1);
       LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
     } else {
-      const Index m_blocks = (m + 127) / 128;
+      const Index m_blocks = (m + 127) / 128;
       const Index n_blocks = (n + 63) / 64;
       const dim3 num_blocks(m_blocks, n_blocks, 1);
       const dim3 block_size(8, 32, 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
new file mode 100644
index 000000000..9b6d18090
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -0,0 +1,464 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+
+namespace Eigen {
+
+namespace internal {
+
+enum {
+  Rhs = 0,
+  Lhs = 1,
+};
+
+/*
+ * Implementation of the Eigen blas_data_mapper class for tensors.
+ */
+
+template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
+  enum {
+    DirectOffsets = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
+    eigen_assert(false && "unsupported");
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
+
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+  {
+    return m_tensor.template packet<LoadMode>(index);
+  }
+
+
+ private:
+  const Tensor m_tensor;
+};
+
+template <typename Tensor> struct CoeffLoader<Tensor, true> {
+  enum {
+    DirectOffsets = true
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+    m_data += offset;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
+
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+  {
+    return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
+  }
+ private:
+  typedef typename Tensor::Scalar Scalar;
+  const Scalar* m_data;
+};
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size, bool inner_dim_contiguous, int Alignment>
+class SimpleTensorContractionMapper {
+  public:
+  EIGEN_DEVICE_FUNC
+  SimpleTensorContractionMapper(const Tensor& tensor,
+                                const nocontract_t& nocontract_strides,
+                                const nocontract_t& ij_strides,
+                                const contract_t& contract_strides,
+                                const contract_t& k_strides) :
+      m_tensor(tensor),
+      m_nocontract_strides(nocontract_strides),
+      m_ij_strides(ij_strides),
+      m_contract_strides(contract_strides),
+      m_k_strides(k_strides) { }
+
+  enum {
+    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+    m_tensor.offsetBuffer(offset);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
+    // column major assumption
+    return operator()(row, 0);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
+    return m_tensor.coeff(computeIndex(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
+    const bool left = (side == Lhs);
+    Index nocontract_val = left ? row : col;
+    Index linidx = 0;
+    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+      const Index idx = nocontract_val / m_ij_strides[i];
+      linidx += idx * m_nocontract_strides[i];
+      nocontract_val -= idx * m_ij_strides[i];
+    }
+    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      if (side == Lhs && inner_dim_contiguous) {
+        eigen_assert(m_nocontract_strides[0] == 1);
+        linidx += nocontract_val;
+      } else {
+        linidx += nocontract_val * m_nocontract_strides[0];
+      }
+    }
+
+    Index contract_val = left ? col : row;
+    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+      const Index idx = contract_val / m_k_strides[i];
+      linidx += idx * m_contract_strides[i];
+      contract_val -= idx * m_k_strides[i];
+    }
+
+    if(array_size<contract_t>::value > 0) {
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx += contract_val;
+      } else {
+        linidx += contract_val * m_contract_strides[0];
+      }
+    }
+
+    return linidx;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
+    const bool left = (side == Lhs);
+    Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
+    Index linidx[2] = {0, 0};
+    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+      const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+      const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+      linidx[0] += idx0 * m_nocontract_strides[i];
+      linidx[1] += idx1 * m_nocontract_strides[i];
+      nocontract_val[0] -= idx0 * m_ij_strides[i];
+      nocontract_val[1] -= idx1 * m_ij_strides[i];
+    }
+    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      if (side == Lhs && inner_dim_contiguous) {
+        eigen_assert(m_nocontract_strides[0] == 1);
+        linidx[0] += nocontract_val[0];
+        linidx[1] += nocontract_val[1];
+      } else {
+        linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
+        linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
+      }
+    }
+
+    Index contract_val[2] = {left ? col : row, left ? col : row + distance};
+    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+      const Index idx0 = contract_val[0] / m_k_strides[i];
+      const Index idx1 = contract_val[1] / m_k_strides[i];
+      linidx[0] += idx0 * m_contract_strides[i];
+      linidx[1] += idx1 * m_contract_strides[i];
+      contract_val[0] -= idx0 * m_k_strides[i];
+      contract_val[1] -= idx1 * m_k_strides[i];
+    }
+
+    if (side == Rhs && inner_dim_contiguous) {
+      eigen_assert(m_contract_strides[0] == 1);
+      linidx[0] += contract_val[0];
+      linidx[1] += contract_val[1];
+    } else {
+      linidx[0] += contract_val[0] * m_contract_strides[0];
+      linidx[1] += contract_val[1] * m_contract_strides[0];
+    }
+    return IndexPair<Index>(linidx[0], linidx[1]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
+    // Only claim alignment when we can compute the actual stride (ie when we're
+    // dealing with the lhs with inner_dim_contiguous. This is because the
+    // matrix-vector product relies on the stride when dealing with aligned inputs.
+    return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
+    return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
+  }
+
+ protected:
+  CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
+  const nocontract_t m_nocontract_strides;
+  const nocontract_t m_ij_strides;
+  const contract_t m_contract_strides;
+  const contract_t m_k_strides;
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size, bool inner_dim_contiguous,
+         bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
+{
+ public:
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
+
+  EIGEN_DEVICE_FUNC
+  BaseTensorContractionMapper(const Tensor& tensor,
+                              const nocontract_t& nocontract_strides,
+                              const nocontract_t& ij_strides,
+                              const contract_t& contract_strides,
+                              const contract_t& k_strides) :
+  ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
+  template <int AlignmentType = Alignment>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+    // whole method makes column major assumption
+
+    // don't need to add offsets for now (because operator handles that)
+    // current code assumes packet size must be a multiple of 2
+    EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
+      const Index index = this->computeIndex(i, j);
+      eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
+      return this->m_tensor.template packet<AlignmentType>(index);
+    }
+
+    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
+    const Index first = indexPair.first;
+    const Index last = indexPair.second;
+
+    // We can always do optimized packet reads from left hand side right now, because
+    // the vertical matrix dimension on the left hand side is never contracting.
+    // On the right hand side we need to check if the contracting dimensions may have
+    // been shuffled first.
+    if (Tensor::PacketAccess &&
+        (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
+        (last - first) == (packet_size - 1)) {
+
+      return this->m_tensor.template packet<AlignmentType>(first);
+    }
+
+    EIGEN_ALIGN_MAX Scalar data[packet_size];
+
+    data[0] = this->m_tensor.coeff(first);
+    for (Index k = 1; k < packet_size - 1; k += 2) {
+      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[packet_size - 1] = this->m_tensor.coeff(last);
+
+    return pload<Packet>(data);
+  }
+
+  template <int AlignmentType = Alignment>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+    // whole method makes column major assumption
+
+    // don't need to add offsets for now (because operator handles that)
+    const Index half_packet_size = unpacket_traits<HalfPacket>::size;
+    if (half_packet_size == packet_size) {
+      return loadPacket<AlignmentType>(i, j);
+    }
+    EIGEN_ALIGN_MAX Scalar data[half_packet_size];
+    for (Index k = 0; k < half_packet_size; k++) {
+      data[k] = operator()(i + k, j);
+    }
+    return pload<HalfPacket>(data);
+  }
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         bool inner_dim_contiguous,
+         bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
+{
+ public:
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
+
+  EIGEN_DEVICE_FUNC
+  BaseTensorContractionMapper(const Tensor& tensor,
+                              const nocontract_t& nocontract_strides,
+                              const nocontract_t& ij_strides,
+                              const contract_t& contract_strides,
+                              const contract_t& k_strides) :
+  ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  typedef typename packet_traits<Scalar>::type Packet;
+  template <int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload<typename packet_traits<Scalar>::type>(data);
+  }
+  template <int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
+    return loadPacket(i, j);
+  }
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size,
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionSubMapper {
+ public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
+  typedef Self LinearMapper;
+
+  enum {
+    // We can use direct offsets iff the parent mapper supports then and we can compute the strides.
+    // TODO: we should also enable direct offsets for the Rhs case.
+    UseDirectOffsets = (side == Lhs) && inner_dim_contiguous && ParentMapper::DirectOffsets
+  };
+
+  EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
+    // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
+    // this offset every time we attempt to access a coefficient.
+    if (UseDirectOffsets) {
+      Index stride = m_base_mapper.stride();
+      m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper(i, 0);
+    }
+    return m_base_mapper(i + m_vert_offset, m_horiz_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper(i, j);
+    }
+    return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<Alignment>(i, 0);
+    }
+    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<Alignment>(i, j);
+    }
+    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
+    }
+    return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+    if (UseDirectOffsets) {
+      m_base_mapper.storePacket(i, 0, p);
+    }
+    m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return LinearMapper(m_base_mapper, i, j);
+    }
+    return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
+    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
+    }
+    return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
+    return false;
+  }
+
+ private:
+  ParentMapper m_base_mapper;
+  const Index m_vert_offset;
+  const Index m_horiz_offset;
+};
+
+
+template<typename Scalar, typename Index, int side,
+         typename Tensor,
+         typename nocontract_t, typename contract_t,
+         int packet_size,
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionInputMapper
+  : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
+
+ public:
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
+  typedef SubMapper VectorMapper;
+
+  EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
+                                                 const nocontract_t& nocontract_strides,
+                                                 const nocontract_t& ij_strides,
+                                                 const contract_t& contract_strides,
+                                                 const contract_t& k_strides)
+      : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(*this, i, j);
+  }
+};
+
+
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
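The UseDirectOffsets machinery in the new file folds the submapper's vertical and horizontal offsets into the underlying buffer pointer once, at construction time, instead of adding them on every element access. A simplified sketch of the same trade-off, with hypothetical types rather than Eigen's classes:

    #include <cstddef>

    struct OffsetView {
      const float* m_data;
      std::ptrdiff_t m_offset;
      float at(std::ptrdiff_t i) const { return m_data[m_offset + i]; }  // add per access
    };

    struct DirectView {
      const float* m_data;  // already advanced by the offset
      DirectView(const float* data, std::ptrdiff_t offset) : m_data(data + offset) {}
      float at(std::ptrdiff_t i) const { return m_data[i]; }  // no extra addition
    };

Baking the offset in is only safe when the underlying storage really is a flat contiguous buffer, which is exactly what the DirectOffsets/RawAccess flags establish; the non-raw CoeffLoader specialization therefore asserts in offsetBuffer().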
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 3ca7daf32..877bcd0df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -162,6 +162,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index a82bfc0aa..367a152a0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -306,6 +306,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<InputArgType, Device>::Layout,
     CoordAccess = false, // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -752,6 +753,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     PacketAccess = false,
     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
     CoordAccess = false, // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index 0157f6fab..0f8a98caf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
     BlockAccess = false,
     Layout = TensorEvaluator<XprType, Device>::Layout,
     CoordAccess = false, // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
@@ -250,6 +251,7 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
     BlockAccess = false,
     Layout = TensorEvaluator<LhsXprType, Device>::Layout,
     CoordAccess = false, // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 4d7570077..5abdc489b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -10,7 +10,6 @@
 #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
 
-
 namespace Eigen {
 
 // This defines an interface that GPUDevice can take to use
@@ -206,20 +205,45 @@ struct GpuDevice {
 #endif
   }
 
-  inline int getNumCudaMultiProcessors() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
+#ifndef __CUDA_ARCH__
     return stream_->deviceProperties().multiProcessorCount;
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return 0;
+#endif
   }
-  inline int maxCudaThreadsPerBlock() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
+#ifndef __CUDA_ARCH__
     return stream_->deviceProperties().maxThreadsPerBlock;
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return 0;
+#endif
   }
-  inline int maxCudaThreadsPerMultiProcessor() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
+#ifndef __CUDA_ARCH__
     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return 0;
+#endif
   }
-  inline int sharedMemPerBlock() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+#ifndef __CUDA_ARCH__
     return stream_->deviceProperties().sharedMemPerBlock;
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return 0;
+#endif
  }
-  inline int majorDeviceVersion() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#ifndef __CUDA_ARCH__
     return stream_->deviceProperties().major;
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return 0;
+#endif
   }
 
   // This function checks if the CUDA runtime recorded an error for the
@@ -239,23 +263,29 @@ struct GpuDevice {
 };
 
 #ifndef __CUDA_ARCH__
-#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
-  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
   assert(cudaGetLastError() == cudaSuccess);
 #else
-#define LAUNCH_CUDA_KERNEL(...) \
-  eigen_assert(false && "Cannot launch a kernel from another kernel");
+#define LAUNCH_CUDA_KERNEL(kernel, ...) \
+  { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
+  eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
 #endif
 
+
 // FIXME: Should be device and kernel specific.
 #ifdef __CUDACC__
-static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+#ifndef __CUDA_ARCH__
   cudaError_t status = cudaDeviceSetSharedMemConfig(config);
   EIGEN_UNUSED_VARIABLE(status)
   assert(status == cudaSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
 }
 #endif
 
 }  // end namespace Eigen
 
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
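The GpuDevice accessors and the reworked LAUNCH_CUDA_KERNEL above both lean on __CUDA_ARCH__, which nvcc defines only during the device compilation pass, so the same source compiles to different bodies for host and device. A minimal illustration of the idiom (hypothetical function, to be compiled with nvcc):

    __host__ __device__ inline int threads_hint() {
    #ifndef __CUDA_ARCH__
      return 256;   // host pass: free to query cached device properties here
    #else
      return -1;    // device pass: runtime queries are unavailable inside kernels
    #endif
    }

Note also that the device-side LAUNCH_CUDA_KERNEL now takes the kernel's address into an unused local; taking the address forces nvcc to instantiate the kernel template even though a launch from device code is impossible.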
otherDimensions) { EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) - (*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}}; + (*this) = array<DenseIndex, NumDims>{firstDimension, otherDimensions...}; } #else EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { @@ -405,20 +405,20 @@ template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::si template <typename Dims1, typename Dims2, size_t n, size_t m> struct sizes_match_below_dim { - static inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { return false; } }; template <typename Dims1, typename Dims2, size_t n> struct sizes_match_below_dim<Dims1, Dims2, n, n> { - static inline bool run(Dims1& dims1, Dims2& dims2) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) & sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2); } }; template <typename Dims1, typename Dims2> struct sizes_match_below_dim<Dims1, Dims2, 0, 0> { - static inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { return true; } }; @@ -427,7 +427,7 @@ struct sizes_match_below_dim<Dims1, Dims2, 0, 0> { template <typename Dims1, typename Dims2> -bool dimensions_match(Dims1& dims1, Dims2& dims2) { +EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index ff4373f59..e7daf7304 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> PacketAccess = true, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -140,7 +141,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> return internal::ploadt<Packet, LoadMode>(m_buffer + index); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } private: TensorEvaluator<ArgType, Device> m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 902f25247..f726585b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, Layout = Derived::Layout, CoordAccess = NumCoords > 0, + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -148,6 +149,7 @@ struct TensorEvaluator<const Derived, Device> PacketAccess = Derived::PacketAccess, Layout = Derived::Layout, CoordAccess = NumCoords > 0, + RawAccess = true }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -207,6 +209,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // 
to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC @@ -257,6 +260,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -312,6 +316,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg internal::functor_traits<BinaryOp>::PacketAccess, Layout = TensorEvaluator<LeftArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -378,6 +383,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> internal::packet_traits<Scalar>::HasBlend, Layout = TensorEvaluator<IfArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index c28078882..d2ab70f2b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -156,14 +156,14 @@ template <typename Expression> class TensorExecutor<Expression, GpuDevice, false> { public: typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; template <typename Expression> class TensorExecutor<Expression, GpuDevice, true> { public: typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); + static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device); }; #if defined(__CUDACC__) @@ -215,7 +215,6 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { template <typename Expression> EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device) { -#ifndef __CUDA_ARCH__ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -228,9 +227,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run( LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); -#else - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif } @@ -238,7 +234,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run( template<typename Expression> EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device) { -#ifndef __CUDA_ARCH__ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) @@ -251,9 +246,6 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size); } 
evaluator.cleanup(); -#else - eigen_assert(false && "Cannot launch a kernel from another kernel"); -#endif } #endif // __CUDACC__ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 215a4ebad..3bfaf6d23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -135,6 +135,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index a4d6ce6b3..7d0858d02 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -44,7 +44,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, PacketAccess = (internal::packet_traits<Scalar>::size > 1), Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, - }; + RawAccess = true + }; typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; @@ -53,7 +54,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, TensorStorage<Scalar, Dimensions, Options> m_storage; public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } @@ -72,7 +73,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return coeff(array<Index, NumIndices>{firstIndex, otherIndices...}); } #endif @@ -104,7 +105,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return coeffRef(array<Index, NumIndices>{firstIndex, otherIndices...}); } #endif @@ -136,7 +137,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return this->operator()(array<Index, NumIndices>{firstIndex, otherIndices...}); } #endif @@ -175,7 +176,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + return operator()(array<Index, NumIndices>{firstIndex, otherIndices...}); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c16bf7e67..c9b0b2f28 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -92,6 +92,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> IsAligned = true, PacketAccess = (internal::packet_traits<Scalar>::size > 1), Layout = TensorEvaluator<ArgType, Device>::Layout, + RawAccess = true }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 9316c9831..96f74b992 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 11e510414..2ab332add 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -168,6 +168,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = NumDims == 5, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index ae9e9f751..2798956ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,6 +91,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index f612bbd45..a37516974 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -123,6 +123,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, 
Device>::PacketAccess, Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator<ArgType, Device>::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 5c759af09..7233f4c89 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -49,7 +49,8 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor IsAligned = ((int(Options_)&Aligned)==Aligned), PacketAccess = (internal::packet_traits<Scalar>::size > 1), Layout = PlainObjectType::Layout, - CoordAccess = true + CoordAccess = true, + RawAccess = true }; EIGEN_DEVICE_FUNC @@ -140,10 +141,10 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{firstIndex, otherIndices...}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{firstIndex, otherIndices...}); return m_data[index]; } } @@ -227,10 +228,10 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{firstIndex, otherIndices...}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{firstIndex, otherIndices...}); return m_data[index]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index f28a9699d..d6ad65070 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -24,6 +24,11 @@ const T2& choose(Cond<false>, const T1&, const T2& second) { return second; } +template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const T x, const T y) { + return (x + y - 1) / y; +} + template <size_t n> struct max_n_1 { static const size_t size = n; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d8c923d74..11284315c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -110,6 +110,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, 
// to be implemented + RawAccess = TensorEvaluator<ArgType, Device>::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -145,7 +146,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> return m_impl.template packet<LoadMode>(index); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } @@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator<ArgType, Device>::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -317,6 +319,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -545,6 +548,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 91e32d200..39a305a93 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -93,6 +93,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = true, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 8fb53f4f2..2cbb820b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = true, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c30980a49..09ee0c2c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -24,11 +24,14 @@ template<typename Op, typename Dims, typename XprType> struct traits<TensorReductionOp<Op, Dims, XprType> > : traits<XprType> { - typedef typename traits<XprType>::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::Scalar Scalar; typedef typename internal::packet_traits<Scalar>::type 
Packet; - typedef typename traits<XprType>::StorageKind StorageKind; - typedef typename traits<XprType>::Index Index; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value; + static const int Layout = XprTraits::Layout; }; template<typename Op, typename Dims, typename XprType> @@ -337,12 +340,22 @@ struct FullReducer<Self, Op, ThreadPoolDevice, true> { #endif +// Default inner reducer +template <typename Self, typename Op, typename Device> +struct InnerReducer { + static const bool HasOptimizedImplementation = false; + + static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + assert(false && "Not implemented"); + } +}; + // Default outer reducer template <typename Self, typename Op, typename Device> struct OuterReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { assert(false && "Not implemented"); } }; @@ -353,6 +366,9 @@ template <int B, int N, typename S, typename R, typename I> __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); template <int NPT, typename S, typename R, typename I> +__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + +template <int NPT, typename S, typename R, typename I> __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif @@ -412,6 +428,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> PacketAccess = Self::InputPacketAccess && Op::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value; @@ -425,19 +442,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), YOU_MADE_A_PROGRAMMING_MISTAKE); - // Bitmap indicating if an input dimension is reduced or not. - array<bool, NumInputDims> reduced; + // Build the bitmap indicating if an input dimension is reduced or not. for (int i = 0; i < NumInputDims; ++i) { - reduced[i] = false; + m_reduced[i] = false; } for (int i = 0; i < NumReducedDims; ++i) { eigen_assert(op.dims()[i] >= 0); eigen_assert(op.dims()[i] < NumInputDims); - reduced[op.dims()[i]] = true; + m_reduced[op.dims()[i]] = true; } const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); - internal::DimInitializer<Dimensions>::run(input_dims, reduced, &m_dimensions, &m_reducedDims); + internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); // Precompute output strides. 
if (NumOutputDims > 0) { @@ -472,7 +488,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> int outputIndex = 0; int reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { + if (m_reduced[i]) { m_reducedStrides[reduceIndex] = input_strides[i]; ++reduceIndex; } else { @@ -493,7 +509,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType; typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. @@ -514,26 +530,41 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> } // Attempt to use an optimized reduction. -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + reducing_inner_dims &= m_reduced[i]; + } else { + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + Op reducer(m_reducer); + internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return false; + } + bool preserving_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - preserving_inner_dims &= m_reducedDims[NumInputDims - 1 - i]; + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; } else { - preserving_inner_dims &= m_reducedDims[i]; + preserving_inner_dims &= m_reduced[i]; } } - if (internal::OuterReducer<Self, Op, GpuDevice>::HasOptimizedImplementation && + if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation && preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::OuterReducer<Self, Op, GpuDevice>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); return false; } } -#endif return true; } @@ -615,6 +646,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif @@ -660,6 +692,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> 
return startInput; } + // Bitmap indicating if an input dimension is reduced or not. + array<bool, NumInputDims> m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 8e250867c..2da18b147 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -76,13 +76,24 @@ __device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) { #endif } + +template <typename CoeffType, typename Index> +__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = val; + } +} + template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, typename Self::CoeffReturnType* output) { const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - if (first_index == 0) { + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + if (gridDim.x == 1 && first_index == 0) { *output = reducer.initialize(); } @@ -115,23 +126,138 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { internal::is_same<typename Self::CoeffReturnType, float>::value; template <typename OutputType> - EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { assert(false && "Should only be called on floats"); } - EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { typedef typename Self::Index Index; const Index num_coeffs = array_prod(self.m_impl.dimensions()); const int block_size = 256; const int num_per_thread = 128; const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>), + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be race conditions between multiple thread blocks. 
+ LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), + 1, 32, 0, device, reducer.initialize(), 1, output); + } + + LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); } }; -#define DIVUP(x, y) (((x) + (y)-1) / (y)) + +template <int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + + const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread); + const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = i / input_col_blocks; + + if (row < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; + + float reduced_val = reducer.initialize(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); + if (last_col >= num_coeffs_to_reduce) { + for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { + const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + reducer.reduce(val, &reduced_val); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k); + reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); + } + } + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); + } + } + + __syncthreads(); + } +} + +template <typename Self, typename Op> +struct InnerReducer<Self, Op, GpuDevice> { + // Unfortunately nvidia doesn't support exotic types such as complex well, + // so we reduce the scope of the optimized version of the code to the simple case + // of floats. 
+ static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same<typename Self::CoeffReturnType, float>::value; + + template <typename Device, typename OutputType> + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce floats on a gpu device"); + } + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be race conditions between multiple thread blocks. + const int dyn_blocks = divup<int>(num_preserved_vals, 1024); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), + num_blocks, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + } +}; + + template <int NumPerThread, typename Self, typename Reducer, typename Index> @@ -139,13 +265,15 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu typename Self::CoeffReturnType* output) { const Index num_threads = blockDim.x * gridDim.x; const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; - // Initialize the output values - for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { - output[i] = reducer.initialize(); + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } } // Do the reduction. 
- const Index max_iter = DIVUP(num_coeffs_to_reduce, NumPerThread) * num_preserved_coeffs; + const Index max_iter = num_preserved_coeffs * numext::maxi<Index>(1, (num_coeffs_to_reduce - NumPerThread + 1)); for (Index i = thread_id; i < max_iter; i += num_threads) { const Index input_col = i % num_preserved_coeffs; const Index input_row = (i / num_preserved_coeffs) * NumPerThread; @@ -169,28 +297,38 @@ struct OuterReducer<Self, Op, GpuDevice> { internal::is_same<typename Self::CoeffReturnType, float>::value; template <typename Device, typename OutputType> - static void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats on a gpu device"); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; - const int dyn_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread)); + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread>), + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be race conditions between multiple thread blocks. 
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), + num_blocks, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); } }; -#undef DIVUP - #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 6b25b2ba0..57197d060 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -139,6 +139,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef PacketAccess = false, Layout = PlainObjectType::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { @@ -367,6 +368,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device> PacketAccess = false, Layout = TensorRef<Derived>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&) @@ -412,6 +414,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons enum { IsAligned = false, PacketAccess = false, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 10328c61f..846f81e0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -239,6 +240,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 15a22aa1b..c4adb7d4c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> PacketAccess = (internal::packet_traits<Scalar>::size > 1), Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -225,6 +226,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits<Scalar>::size > 1), + RawAccess = false }; EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 97b6168a9..2c2eb6515 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -258,6 +259,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 7a9568b36..2f06f8442 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -20,7 +20,7 @@ class compute_tensor_flags enum { is_dynamic_size_storage = 1, - aligned_bit = + is_aligned = ( ((Options&DontAlign)==0) && ( #if EIGEN_MAX_STATIC_ALIGN_BYTES>0 @@ -35,12 +35,12 @@ class compute_tensor_flags 0 #endif ) - ) ? AlignedBit : 0, - packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0 + ), + packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0 }; public: - enum { ret = packet_access_bit | aligned_bit}; + enum { ret = packet_access_bit}; }; @@ -86,7 +86,7 @@ struct traits<TensorMap<PlainObjectType, Options_> > static const int Layout = BaseTraits::Layout; enum { Options = Options_, - Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = BaseTraits::Flags, }; }; @@ -102,7 +102,7 @@ struct traits<TensorRef<PlainObjectType> > static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, - Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = BaseTraits::Flags, }; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 6625c66d5..52b78b261 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -181,6 +181,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = NumDims == 6, + RawAccess = false }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index bc4f2025f..1b9fe2779 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -55,7 +55,7 @@ class DynamicSGroup inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}}); + return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{firstIndex, otherIndices...}); } template<typename Tensor_> @@ -90,7 +90,7 @@ class DynamicSGroup template<typename Index, std::size_t N, int... n> inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const { - return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }}; + return std::array<Index, N>{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }; } template<typename Index> diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index 942293bd7..255c344b4 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -217,7 +217,7 @@ class StaticSGroup inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const { static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}}); + return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{firstIndex, otherIndices...}); } template<typename Tensor_> diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h index eb3e17330..ee97299af 100644 --- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h +++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h @@ -95,10 +95,10 @@ template<typename Scalar> struct GoogleSparseHashMapTraits * * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access * - * \param SparseMatrixType the type of the sparse matrix we are updating - * \param MapTraits a traits class representing the map implementation used for the temporary sparse storage. + * \tparam SparseMatrixType the type of the sparse matrix we are updating + * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage. * Its default value depends on the system. - * \param OuterPacketBits defines the number of rows (or columns) manage by a single map object + * \tparam OuterPacketBits defines the number of rows (or columns) managed by a single map object * as a power of two exponent. 
* * This class temporarily represents a sparse matrix object using a generic map implementation allowing for diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 2ddf47234..6fdefd66c 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -167,13 +167,13 @@ static void test_fixed_size_broadcasting() TensorFixedSize<float, Sizes<1>, DataLayout> t2; t2 = t2.constant(20.0f); - Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}}); + Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{10}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); } - TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}}); - Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}}); + TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {1}); + Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{10}); for (int i = 0; i < 10; ++i) { VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index b0d52c6cf..c5f3af73e 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -456,7 +456,7 @@ static void test_tensor_product() mat1.setRandom(); mat2.setRandom(); - Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{{}}); + Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{}); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 3); diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index a8a095e38..dc0f8a5a2 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -130,7 +130,7 @@ static void test_3d() } TensorMap<Tensor<const int, 3>> mat3(mat1.data(), 2, 3, 7); - TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), array<DenseIndex, 3>{{2, 3, 7}}); + TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), array<DenseIndex, 3>{2, 3, 7}); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
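
Many of the hunks above replace double-braced initializers such as array<Index, NumIndices>{{firstIndex, otherIndices...}} with a single pair of braces. Both forms aggregate-initialize a struct that wraps a built-in array; the single-braced spelling relies on brace elision. A minimal sketch of why both spellings compile, using a simplified stand-in for Eigen's emulated array rather than the real class:

  #include <cstddef>

  // Simplified stand-in for the emulated array: an aggregate that wraps
  // a built-in array, which is what makes brace elision applicable.
  template <typename T, std::size_t N>
  struct array {
    T values[N];
    const T& operator[](std::size_t i) const { return values[i]; }
  };

  int main() {
    array<int, 3> a{{2, 3, 7}};  // explicit braces for the nested member
    array<int, 3> b{2, 3, 7};    // brace elision: the outer list fills values[]
    return (a[0] == b[0] && a[2] == b[2]) ? 0 : 1;
  }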
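The RawAccess constant added to every evaluator above advertises whether the evaluator can hand out its result as a contiguous buffer through data(): concrete containers (Tensor, TensorMap, TensorFixedSize) and the forced/eval-to evaluators report true, computed expressions report false, and pass-through ops such as reshaping and layout swap forward their child's value. A sketch of the kind of compile-time dispatch such a flag enables, with hypothetical mini-evaluators in place of the Eigen classes:

  #include <cstddef>

  struct BufferEval {            // materialized data, like a plain Tensor
    enum { RawAccess = true };
    const float* data() const { return buf; }
    float coeff(std::size_t i) const { return buf[i]; }
    const float* buf;
  };

  struct SquareEval {            // a cwise expression: nothing to point at
    enum { RawAccess = false };
    const float* data() const { return 0; }
    float coeff(std::size_t i) const { const float v = src[i]; return v * v; }
    const float* src;
  };

  template <typename Eval>
  void copy_out(const Eval& e, float* dst, std::size_t n) {
    if (Eval::RawAccess) {        // compile-time constant, the dead branch folds away
      const float* p = e.data();  // bulk path: could equally be a memcpy
      for (std::size_t i = 0; i < n; ++i) dst[i] = p[i];
    } else {
      for (std::size_t i = 0; i < n; ++i) dst[i] = e.coeff(i);  // per-coefficient path
    }
  }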
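The divup helper added to TensorMeta.h is plain ceiling division, and the reducers above use it in place of the old DIVUP macro and the std::ceil float round-trip when sizing launches: enough blocks to cover the data, capped by what the device can keep resident. A standalone sketch of that arithmetic; the SM count and threads per SM are made-up numbers standing in for getNumCudaMultiProcessors() and maxCudaThreadsPerMultiProcessor():

  #include <algorithm>
  #include <cstdio>

  // Same shape as the helper added to TensorMeta.h: smallest q with q*y >= x.
  template <typename T>
  T divup(const T x, const T y) {
    return (x + y - 1) / y;
  }

  int main() {
    const int num_coeffs = 1000000;  // total coefficients to reduce (example)
    const int block_size = 256;      // threads per block, as in OuterReducer
    const int num_per_thread = 16;   // coefficients each thread accumulates

    // Enough blocks to cover the input, rounded up...
    const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread);
    // ...but no more than the device can run concurrently
    // (16 SMs x 2048 threads per SM are stand-in values).
    const int max_blocks = 16 * 2048 / block_size;
    const int num_blocks = std::min(max_blocks, dyn_blocks);

    std::printf("%d blocks of %d threads\n", num_blocks, block_size);
    return 0;
  }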
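ReductionInitKernel exists because several blocks may accumulate into the same output locations, and atomic reductions need those locations pre-filled with the reducer's initial value. When more than one block is launched, a separate init kernel runs first; the reduction kernels only initialize the outputs themselves when gridDim.x == 1, which is what the new guards above check. A cut-down CUDA sketch of the pattern, summing floats; the kernel names and the direct atomicAdd are simplifications, not the Eigen internals:

  __global__ void InitKernel(float val, int n, float* out) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    for (int i = tid; i < n; i += stride) out[i] = val;
  }

  __global__ void SumKernel(const float* in, int n, float* out) {
    // A single-block launch can initialize here; with several blocks the
    // init kernel already ran (same-stream kernels execute in order).
    if (gridDim.x == 1) {
      if (threadIdx.x == 0) *out = 0.0f;
      __syncthreads();
    }
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    float acc = 0.0f;
    for (int i = tid; i < n; i += stride) acc += in[i];
    atomicAdd(out, acc);  // the real kernels reduce within a warp first
  }

  void launch_sum(const float* d_in, int n, float* d_out, int num_blocks) {
    if (num_blocks > 1) InitKernel<<<1, 32>>>(0.0f, 1, d_out);
    SumKernel<<<num_blocks, 256>>>(d_in, n, d_out);
  }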
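InnerReductionKernel finishes each row with a warp-level tree reduction: each __shfl_down step halves the number of live partial values, so after log2(warpSize) steps lane 0 holds the warp's total and is the only lane that touches global memory. A cut-down CUDA sketch of that final step (a float sum; __shfl_down is the pre-CUDA-9 intrinsic used above, later toolkits spell it __shfl_down_sync, and out[] is assumed zero-initialized, e.g. by an init kernel as sketched earlier):

  __device__ float warp_sum(float v) {
    // Tree reduction across the warp: lane i adds the value held by lane i+offset.
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
      v += __shfl_down(v, offset);
    return v;  // meaningful in lane 0 of each warp
  }

  __global__ void RowSumKernel(const float* in, int cols, float* out) {
    const int row = blockIdx.x;  // one block of warpSize threads per row
    float acc = 0.0f;
    for (int c = threadIdx.x; c < cols; c += blockDim.x)
      acc += in[row * cols + c];
    acc = warp_sum(acc);
    if ((threadIdx.x & (warpSize - 1)) == 0)
      atomicAdd(&out[row], acc);  // stand-in for Eigen's atomicReduce
  }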