From 6fa6cdd2b988da98cbdd2b1a5fd2fd3b9d56a4b1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 4 Jun 2014 09:21:48 -0700 Subject: Added support for tensor contractions Updated expression evaluation mechanism to also compute the size of the tensor result Misc fixes and improvements. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 229 +++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 000000000..d424df36e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorContractionOp type; +}; + +} // end namespace internal + + + +template +class TensorContractionOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template struct max_n_1 { + static 
const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +template +struct TensorEvaluator > +{ + typedef TensorContractionOp XprType; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * Indices::size>::size; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ + false, + }; + + TensorEvaluator(const XprType& op) + : m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) + { + Index index = 0; + Index stride = 1; + m_shiftright = 1; + + int skipped = 0; + const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + bool skip = false; + for (int j = 0; j < Indices::size; ++j) { + if (op.indices()[j].first == i) { + skip = true; + m_leftOffsets[2*skipped] = stride; + m_leftOffsets[2*skipped+1] = stride * left_dims[i]; + m_stitchsize[skipped] = left_dims[i]; + break; + } + } + if (!skip) { + m_dimensions[index++] = left_dims[i]; + m_shiftright *= left_dims[i]; + } else { + ++skipped; + } + stride *= left_dims[i]; + } + + stride = 1; + skipped = 0; + const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + bool skip = false; + for (int j = 0; j < Indices::size; ++j) { + if (op.indices()[j].second == i) { + skip = true; + m_rightOffsets[2*skipped] = stride; + m_rightOffsets[2*skipped+1] = stride * right_dims[i]; + break; + } + } + if (!skip) { + m_dimensions[index++] = right_dims[i]; + } else { + ++skipped; + } + stride *= right_dims[i]; + } + + // Scalar case + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * Indices::size) { + m_dimensions[0] = 1; + } + } + + // typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + const Dimensions& dimensions() const { return m_dimensions; } + + void evalTo(typename XprType::Scalar* buffer) const { + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + const Index startLeft = index % m_shiftright; + const Index startRight = index / m_shiftright; + CoeffReturnType result = CoeffReturnType(0); + partialStitch(startLeft, startRight, 0, result); + return result; + } + + /* TODO: vectorization + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + assert(false); + }*/ + + private: + EIGEN_DEVICE_FUNC void partialStitch(Index startLeft, Index startRight, int StitchIndex, CoeffReturnType& accum) const { + Index firstLeft = (startLeft / m_leftOffsets[2*StitchIndex]) * m_leftOffsets[2*StitchIndex+1] + (startLeft % m_leftOffsets[2*StitchIndex]); + Index firstRight = (startRight / m_rightOffsets[2*StitchIndex]) * m_rightOffsets[2*StitchIndex+1] + (startRight % m_rightOffsets[2*StitchIndex]); + + for (int j = 0; j < m_stitchsize[StitchIndex]; ++j) { + const Index left = firstLeft+j*m_leftOffsets[2*StitchIndex]; + const Index right = firstRight+j*m_rightOffsets[2*StitchIndex]; + if (StitchIndex < Indices::size-1) { + partialStitch(left, right, StitchIndex+1, accum); + } else { + accum += m_leftImpl.coeff(left) * 
m_rightImpl.coeff(right); + } + } + } + + private: + array m_leftOffsets; + array m_rightOffsets; + array m_stitchsize; + Index m_shiftright; + Dimensions m_dimensions; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H -- cgit v1.2.3 From a669052f12d6d71ba815764d6419726d64fef675 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:45:30 -0700 Subject: Improved support for rvalues in tensor expressions. --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 58 ++++++++++++++++------ .../Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++ .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 4 ++ unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 8 +++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 6 ++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 6 ++- 7 files changed, 71 insertions(+), 20 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 932e5c82d..e447a5d40 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -22,7 +22,7 @@ namespace Eigen { */ template -class TensorBase +class TensorBase { public: typedef typename internal::traits::Scalar Scalar; @@ -30,19 +30,6 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setZero() { - return setConstant(Scalar(0)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { - return derived() = constant(val); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = random(); - } - // Nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> @@ -224,14 +211,53 @@ class TensorBase return TensorReshapingOp(derived(), newDimensions); } + protected: + template friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + + +template +class TensorBase : public TensorBase { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Index Index; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; + + template friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + // Select the device on which to evaluate the expression. 
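// (Illustrative usage sketch — not part of the original patch. It assumes the
//  ThreadPoolDevice declared in TensorDeviceType.h and hypothetical float tensors:
//    Eigen::Tensor<float, 2> a(30, 40), b(30, 40), c(30, 40);
//    a.setRandom(); b.setRandom();
//    Eigen::ThreadPoolDevice pool_device(/*num_cores=*/4);
//    c.device(pool_device) = a + b;   // assignment is routed through TensorDevice::operator=
//  Without the device() call, the expression is evaluated on the DefaultDevice.)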
template TensorDevice device(const DeviceType& device) { return TensorDevice(device, derived()); } - protected: - template friend class TensorBase; + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d424df36e..d371eb76d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -35,6 +35,10 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index ca2e0e562..501e9a522 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -35,6 +35,10 @@ struct traits > typedef typename KernelXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 60908ee94..de66da13f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -36,6 +36,10 @@ struct traits > typedef typename XprType::Scalar Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; + + enum { + Flags = 0, + }; }; } // end namespace internal @@ -153,6 +157,10 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index b8833362c..1fb90478f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -15,7 +15,7 @@ namespace Eigen { template class Tensor; template class TensorFixedSize; template class TensorMap; -template class TensorBase; +template::value> class TensorBase; template class TensorCwiseNullaryOp; template class TensorCwiseUnaryOp; @@ -29,6 +29,10 @@ template class TensorDevice; // Move to internal? template struct TensorEvaluator; +namespace internal { +template struct TensorAssign; +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3e089fe1e..7d5f9271e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -21,7 +21,7 @@ namespace Eigen { */ namespace internal { template -struct traits > +struct traits > : public traits { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
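// (Illustrative use of the reshape() operation these traits describe — not part of
//  the original patch; the construction of the dimension object is an assumption:
//    Eigen::Tensor<float, 2> input(6, 4);
//    input.setRandom();
//    Eigen::DSizes<Eigen::DenseIndex, 3> new_dims;
//    new_dims[0] = 2; new_dims[1] = 3; new_dims[2] = 4;
//    Eigen::Tensor<float, 3> result(2, 3, 4);
//    result = input.reshape(new_dims);   // TensorReshapingOp node: same 24 coefficients, new shape
// )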
typedef typename XprType::Scalar Scalar; @@ -81,6 +81,7 @@ template struct TensorEvaluator > { typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; enum { IsAligned = TensorEvaluator::IsAligned, @@ -95,7 +96,7 @@ struct TensorEvaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - const NewDimensions& dimensions() const { return m_dimensions; } + const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 2de698a57..40f805741 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -52,7 +52,7 @@ struct traits > typedef DenseIndex Index; enum { Options = Options_, - Flags = compute_tensor_flags::ret, + Flags = compute_tensor_flags::ret | LvalueBit, }; }; @@ -63,6 +63,10 @@ struct traits > typedef Scalar_ Scalar; typedef Dense StorageKind; typedef DenseIndex Index; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | LvalueBit, + }; }; -- cgit v1.2.3 From a77458a8ff2a83e716add62253eb50ef64980b21 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 10:06:57 -0700 Subject: Fixes compilation errors triggered when compiling the tensor contraction code with cxx11 enabled. --- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 6 ++++++ .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 17 +++++++++++++---- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 16 ++++++++-------- 3 files changed, 27 insertions(+), 12 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index f102872ae..423ca4be4 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -66,6 +66,12 @@ template constexpr inline T const& array_ #undef STD_GET_ARR_HACK +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; + + /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 636063f9e..1d3164d6a 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -182,23 +182,32 @@ array repeat(t v) { } template -t array_prod(const array& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; for (size_t i = 0; i < n; ++i) { prod *= a[i]; } return prod; } template -t array_prod(const array& /*a*/) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { return 0; } -template inline T& array_get(array& a) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; } -template inline const T& array_get(const array& a) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const T& array_get(const array& a) { return a[I]; } + +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; + + struct sum_op { template static inline bool run(A a, B b) { return a + b; } }; diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d371eb76d..5149de1bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -107,7 +107,7 @@ struct TensorEvaluator XprType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * Indices::size>::size; + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; typedef typename XprType::Index Index; typedef DSizes Dimensions; @@ -128,7 +128,7 @@ struct TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; - for (int j = 0; j < Indices::size; ++j) { + for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].first == i) { skip = true; m_leftOffsets[2*skipped] = stride; @@ -151,7 +151,7 @@ struct TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; - for (int j = 0; j < Indices::size; ++j) { + for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].second == i) { skip = true; m_rightOffsets[2*skipped] = stride; @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * Indices::size) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } @@ -209,7 +209,7 @@ struct TensorEvaluator::value-1) { partialStitch(left, right, StitchIndex+1, accum); } else { accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); @@ -218,9 +218,9 @@ struct TensorEvaluator m_leftOffsets; - array m_rightOffsets; - array m_stitchsize; + array::value> m_leftOffsets; + array::value> m_rightOffsets; + array::value> m_stitchsize; Index m_shiftright; Dimensions m_dimensions; TensorEvaluator m_leftImpl; -- cgit v1.2.3 From 925fb6b93710b95082ba44d30405289dff3707eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Jun 2014 09:14:44 -0700 Subject: TensorEval are now typed on the device: this will make it possible to use partial template specialization to optimize the strategy of each evaluator for each device type. Started work on partial evaluations. --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 42 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 14 ++-- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 26 ++++---- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 20 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 28 ++++++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 +++++++++++----------- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 9 +-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 14 ++-- 9 files changed, 129 insertions(+), 102 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index da1eb62cb..633a7a31b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -32,15 +32,15 @@ namespace Eigen { namespace internal { // Default strategy: the expressions are evaluated with a single cpu thread. 
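// (Sketch of the dispatch pattern this change enables — simplified, hypothetical names,
//  not part of the original patch. Giving every evaluator a Device template parameter
//  allows partial specialization per device type:
//    template <typename Expr, typename Device> struct Assign {     // generic single-thread path
//      static void run(Expr& dst, const Expr& src, const Device& d);
//    };
//    template <typename Expr> struct Assign<Expr, GpuDevice> {     // device-specific override
//      static void run(Expr& dst, const Expr& src, const GpuDevice& d);  // e.g. launch on d.stream()
//    };
//  The same mechanism is applied below to TensorEvaluator<Expr, Device>.)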
-template::PacketAccess & TensorEvaluator::PacketAccess> +template::PacketAccess & TensorEvaluator::PacketAccess> struct TensorAssign { typedef typename Derived1::Index Index; EIGEN_DEVICE_FUNC - static inline void run(Derived1& dst, const Derived2& src) + static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); for (Index i = 0; i < size; ++i) { evalDst.coeffRef(i) = evalSrc.coeff(i); @@ -49,19 +49,19 @@ struct TensorAssign }; -template -struct TensorAssign +template +struct TensorAssign { typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src) + static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int PacketSize = unpacket_traits::PacketReturnType>::size; + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int PacketSize = unpacket_traits::PacketReturnType>::size; const int VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { @@ -116,12 +116,12 @@ struct TensorAssignMultiThreaded typedef typename Derived1::Index Index; static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, DefaultDevice()); + TensorEvaluator evalSrc(src, Defaultevice()); const Index size = dst.size(); - static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; + static const int PacketSize = Vectorizable ? 
unpacket_traits::PacketReturnType>::size : 1; int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); @@ -131,7 +131,7 @@ struct TensorAssignMultiThreaded vector > results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); + results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -167,19 +167,19 @@ struct TensorAssignGpu typedef typename Derived1::Index Index; static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); const int block_size = std::min(size, 32*32); const int num_blocks = size / block_size; - EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); + EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); const int remaining_items = size % block_size; if (remaining_items > 0) { const int peel_start_offset = num_blocks * block_size; const int peel_block_size = std::min(size, 32); const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); + EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index e447a5d40..6b53d2a3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -198,19 +198,25 @@ class TensorBase } // Coefficient-wise ternary operators. - template - inline const TensorSelectOp + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } // Morphing operators (slicing tbd). - template - inline const TensorReshapingOp + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp reshape(const NewDimensions& newDimensions) const { return TensorReshapingOp(derived(), newDimensions); } + // Force the evaluation of the expression. 
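// (Illustrative only — not part of the original patch. eval() wraps the expression in a
//  TensorForcedEvalOp node; when the enclosing expression is evaluated, that sub-expression
//  is computed once into a temporary buffer instead of being recomputed per coefficient:
//    Eigen::Tensor<float, 2> a(3, 4), b(3, 4), c(3, 4);
//    a.setRandom(); b.setRandom();
//    c = (a + b).eval();   // hypothetical usage: a + b is materialized, then copied into c
// )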
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + protected: template friend class TensorBase; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 5149de1bb..cadbabda2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -102,31 +102,31 @@ template <> struct max_n_1<0> { }; -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorContractionOp XprType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; typedef typename XprType::Index Index; typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ false, }; - TensorEvaluator(const XprType& op) - : m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) + TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { Index index = 0; Index stride = 1; m_shiftright = 1; int skipped = 0; - const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].first == i) { @@ -148,8 +148,8 @@ struct TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].second == i) { @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } @@ -223,8 +223,8 @@ struct TensorEvaluator::value> m_stitchsize; Index m_shiftright; Dimensions m_dimensions; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 501e9a522..a554b8260 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -94,27 +94,27 @@ class TensorConvolutionOp : public TensorBase -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorConvolutionOp XprType; - static const int NumDims = TensorEvaluator::Dimensions::count; + static const int NumDims = TensorEvaluator::Dimensions::count; static const int KernelDims = Indices::size; typedef typename XprType::Index Index; 
typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ false, }; - TensorEvaluator(const XprType& op) - : m_inputImpl(op.inputExpression()), m_kernelImpl(op.kernelExpression()), m_dimensions(op.inputExpression().dimensions()) + TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_dimensions(op.inputExpression().dimensions()) { - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); for (int i = 0; i < NumDims; ++i) { if (i > 0) { @@ -200,8 +200,8 @@ struct TensorEvaluator m_indexStride; array m_kernelStride; Dimensions m_dimensions; - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index dbe60a165..ce524a818 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -31,7 +31,7 @@ template class TensorDevice { template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssign::run(m_expression, other); + internal::TensorAssign::run(m_expression, other, m_device); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index d7f5ab7c9..142edda14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -15,6 +15,12 @@ namespace Eigen { // Default device for the machine (typically a single cpu core) struct DefaultDevice { + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } }; @@ -22,14 +28,19 @@ struct DefaultDevice { // We should really use a thread pool here but first we need to find a portable thread pool library. #ifdef EIGEN_USE_THREADS struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } size_t numThreads() const { return num_threads_; } - /*ThreadPool* threadPool() const { return pool_; }*/ + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } private: // todo: NUMA, ... size_t num_threads_; - /*ThreadPool* pool_;*/ }; #endif @@ -40,7 +51,16 @@ struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
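// (Illustrative sketch, not part of the original patch — assumes the CUDA runtime API:
//    cudaStream_t stream;
//    cudaStreamCreate(&stream);               // caller creates the stream
//    Eigen::GpuDevice gpu_device(&stream);    // GpuDevice only stores a pointer to it
//    ... c.device(gpu_device) = a + b; ...
//    cudaStreamDestroy(stream);               // and the caller destroys it afterwards
// )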
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } - const cudaStream_t& stream() const { return *stream_; } + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + void* result; + cudaMalloc(&result, num_bytes); + return result; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + cudaFree(buffer); + } private: // TODO: multigpu. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ab2513cea..80fe06957 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -23,7 +23,7 @@ namespace Eigen { * leading to lvalues (slicing, reshaping, etc...) */ -template +template struct TensorEvaluator { typedef typename Derived::Index Index; @@ -38,7 +38,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m) + EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } @@ -73,8 +73,8 @@ struct TensorEvaluator // -------------------- CwiseNullaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; @@ -84,14 +84,14 @@ struct TensorEvaluator > }; EIGEN_DEVICE_FUNC - TensorEvaluator(const XprType& op) - : m_functor(op.functor()), m_argImpl(op.nestedExpression()) + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -108,32 +108,32 @@ struct TensorEvaluator > private: const NullaryOp m_functor; - TensorEvaluator m_argImpl; + TensorEvaluator m_argImpl; }; // -------------------- CwiseUnaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), - m_argImpl(op.nestedExpression()) + m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -150,33 +150,33 @@ struct TensorEvaluator > private: const UnaryOp m_functor; - TensorEvaluator m_argImpl; + TensorEvaluator m_argImpl; }; // -------------------- CwiseBinaryOp -------------------- -template -struct TensorEvaluator > +template +struct 
TensorEvaluator, Device> { typedef TensorCwiseBinaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), - m_leftImpl(op.lhsExpression()), - m_rightImpl(op.rhsExpression()) + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -196,34 +196,34 @@ struct TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; // -------------------- SelectOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorSelectOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & TensorEvaluator::PacketAccess*/, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) - : m_condImpl(op.ifExpression()), - m_thenImpl(op.thenExpression()), - m_elseImpl(op.elseExpression()) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -248,9 +248,9 @@ struct TensorEvaluator } private: - TensorEvaluator m_condImpl; - TensorEvaluator m_thenImpl; - TensorEvaluator m_elseImpl; + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 1fb90478f..27bfe1d73 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,16 +21,17 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; +template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; -template class TensorDevice; +template class TensorForcedEvalOp; -// Move to internal? 
-template struct TensorEvaluator; +template class TensorDevice; +template struct TensorEvaluator; namespace internal { -template struct TensorAssign; +template struct TensorAssign; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 7d5f9271e..e9e74581f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -77,19 +77,19 @@ class TensorReshapingOp : public TensorBase -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, }; - TensorEvaluator(const XprType& op) - : m_impl(op.expression()), m_dimensions(op.dimensions()) + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { } typedef typename XprType::Index Index; @@ -111,7 +111,7 @@ struct TensorEvaluator > private: NewDimensions m_dimensions; - TensorEvaluator m_impl; + TensorEvaluator m_impl; }; -- cgit v1.2.3 From 38ab7e6ed0491bd5a0c639f218d5ea4728bf1e81 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 Jun 2014 09:56:51 -0700 Subject: Reworked the expression evaluation mechanism in order to make it possible to efficiently compute convolutions and contractions in the future: * The scheduling of computation is moved out the the assignment code and into a new TensorExecutor class * The assignment itself is now a regular node on the expression tree * The expression evaluators start by recursively evaluating all their subexpressions if needed --- unsupported/Eigen/CXX11/Tensor | 4 + unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 250 +++++++++------------ .../Eigen/CXX11/src/Tensor/TensorContraction.h | 8 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 9 + unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 16 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 146 ++++++++++++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 56 ++++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 194 ++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 142 ++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 7 + 14 files changed, 685 insertions(+), 164 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index c67020581..7e504b302 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -42,8 +42,12 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include 
"unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" + #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 7f614bbe8..09601fc7d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -236,7 +236,9 @@ class Tensor : public TensorBase > // FIXME: we need to resize the tensor to fix the dimensions of the other. // Unfortunately this isn't possible yet when the rhs is an expression. // resize(other.dimensions()); - internal::TensorAssign::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 633a7a31b..a2a925775 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H #define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H -#ifdef EIGEN_USE_THREADS -#include -#endif - namespace Eigen { /** \class TensorAssign @@ -21,172 +17,134 @@ namespace Eigen { * * \brief The tensor assignment class. * - * This class is responsible for triggering the evaluation of the expressions - * used on the lhs and rhs of an assignment operator and copy the result of - * the evaluation of the rhs expression at the address computed during the - * evaluation lhs expression. - * - * TODO: vectorization. For now the code only uses scalars - * TODO: parallelisation using multithreading on cpu, or kernels on gpu. + * This class is represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. */ namespace internal { - -// Default strategy: the expressions are evaluated with a single cpu thread. 
-template::PacketAccess & TensorEvaluator::PacketAccess> -struct TensorAssign +template +struct traits > { - typedef typename Derived1::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - for (Index i = 0; i < size; ++i) { - evalDst.coeffRef(i) = evalSrc.coeff(i); - } - } + typedef typename LhsXprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; +template +struct eval, Eigen::Dense> +{ + typedef const TensorAssignOp& type; +}; -template -struct TensorAssign +template +struct nested, 1, typename eval >::type> { - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { - evalDst.template writePacket(i, evalSrc.template packet(i)); - } - for (Index i = VectorizedSize; i < size; ++i) { - evalDst.coeffRef(i) = evalSrc.coeff(i); - } - } + typedef TensorAssignOp type; }; +} // end namespace internal -// Multicore strategy: the index space is partitioned and each core is assigned to a partition -#ifdef EIGEN_USE_THREADS -template -struct EvalRange { - static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { - eigen_assert(last > first); - for (Index i = first; i < last; ++i) { - dst.coeffRef(i) = src.coeff(i); - } - } -}; -template -struct EvalRange { - static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { - eigen_assert(last > first); - - Index i = first; - static const int PacketSize = unpacket_traits::size; - if (last - first > PacketSize) { - static const int LhsStoreMode = LhsEval::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = RhsEval::IsAligned ? 
Aligned : Unaligned; - eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { - dst.template writePacket(i, src.template packet(i)); - } - } - - for (; i < last; ++i) { - dst.coeffRef(i) = src.coeff(i); - } - } +template +class TensorAssignOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename LhsXprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all::type& m_lhs_xpr; + const typename internal::remove_all::type& m_rhs_xpr; }; -template -struct TensorAssignMultiThreaded + +template +struct TensorEvaluator, Device> { - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) + typedef TensorAssignOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { - TensorEvaluator evalDst(dst, DefaultDevice()); - TensorEvaluator evalSrc(src, Defaultevice()); - const Index size = dst.size(); - - static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; - - int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); - const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Index i = 0; - vector > results; - results.reserve(numblocks); - for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); - } - - for (int i = 0; i < numblocks; ++i) { - results[i].get(); - } - - if (numblocks * blocksize < size) { - EvalRange, TensorEvaluator, Index>::run(evalDst, evalSrc, numblocks * blocksize, size); - } + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); } -}; -#endif - -// GPU: the evaluation of the expressions is offloaded to a GPU. 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template -__global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - evalDst.coeffRef(index) = evalSrc.coeff(index); -} -template -__global__ void EigenMetaKernelPeel(LhsEvaluator evalDst, const RhsEvaluator evalSrc, int peel_start_offset, int size) { - const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; - if (index < size) { - evalDst.coeffRef(index) = evalSrc.coeff(index); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_leftImpl.evalSubExprsIfNeeded(); + m_rightImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); } -} -template -struct TensorAssignGpu -{ - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - const int block_size = std::min(size, 32*32); - const int num_blocks = size / block_size; - EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); - - const int remaining_items = size % block_size; - if (remaining_items > 0) { - const int peel_start_offset = num_blocks * block_size; - const int peel_block_size = std::min(size, 32); - const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); - } + return m_leftImpl.coeff(index); } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet(index); + } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; -#endif -} // end namespace internal +} -} // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index cadbabda2..b2e12fd15 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,6 +184,14 @@ struct TensorEvaluator class TensorDevice { template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssign::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -48,7 +51,10 @@ template class TensorDevice EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssignMultiThreaded::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -67,13 
+73,15 @@ template class TensorDevice template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssignGpu::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); return *this; } protected: const GpuDevice& m_device; - ExpressionType& m_expression; + ExpressionType m_expression; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 000000000..db716a80e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,146 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + + enum { + Flags = 0, + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorEvalToOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorEvalToOp type; +}; + +} // end namespace internal + + + + +template +class TensorEvalToOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(Scalar* buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC Scalar* buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + Scalar* m_buffer; +}; + + + +template +struct TensorEvaluator, Device> +{ + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType 
PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + private: + TensorEvaluator m_impl; + const Device& m_device; + Scalar* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 80fe06957..5c8b079da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -38,27 +38,32 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m, const Device&) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); return m_data[index]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); return m_data[index]; } - template + template EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_data + index); } - template + template EIGEN_STRONG_INLINE void writePacket(Index index, const Packet& x) { return internal::pstoret(m_data + index, x); @@ -95,13 +100,16 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(index); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(index); } @@ -137,13 +145,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_argImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE 
PacketReturnType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); } @@ -184,12 +199,21 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); } @@ -230,12 +254,24 @@ struct TensorEvaluator // TODO: use then or else impl instead if they happen to be known at compile time. return m_condImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_condImpl.evalSubExprsIfNeeded(); + m_thenImpl.evalSubExprsIfNeeded(); + m_elseImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + PacketReturnType packet(Index index) const { static const int PacketSize = internal::unpacket_traits::size; internal::Selector select; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 000000000..3e41f3290 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,194 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +#ifdef EIGEN_USE_THREADS +#include +#endif + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +// Default strategy: the expression is evaluated with a single cpu thread. 
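The single-threaded strategy named in the comment above boils down to a three-phase protocol that every evaluator in this patch series implements: evalSubExprsIfNeeded(), a flat loop of evalScalar()/evalPacket() calls over the linearized index space, then cleanup(). A hypothetical sketch of that protocol (the function name is mine; in the real executor the trip count comes from evaluator.dimensions().TotalSize()):

// Illustration only, not part of the patch.
template <typename Evaluator, typename Index>
void run_single_threaded(Evaluator& evaluator, Index total_size) {
  evaluator.evalSubExprsIfNeeded();   // let subexpressions materialize temporary buffers
  for (Index i = 0; i < total_size; ++i) {
    evaluator.evalScalar(i);          // for an assignment: dst.coeffRef(i) = src.coeff(i)
  }
  evaluator.cleanup();                // release any temporaries allocated above
}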
+template::PacketAccess> +struct TensorExecutor +{ + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + + evaluator.cleanup(); + } +}; + + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + static const int PacketSize = unpacket_traits::PacketReturnType>::size; + const int VectorizedSize = (size / PacketSize) * PacketSize; + + for (Index i = 0; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + + evaluator.cleanup(); + } +}; + + + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template +struct EvalRange { + static void run(Evaluator& evaluator, const Index first, const Index last) { + eigen_assert(last > first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template +struct EvalRange { + static void run(Evaluator& evaluator, const Index first, const Index last,) { + eigen_assert(last > first); + + Index i = first; + static const int PacketSize = unpacket_traits::size; + if (last - first > PacketSize) { + eigen_assert(first % PacketSize == 0); + Index lastPacket = last - (last % PacketSize); + for (; i < lastPacket; i += PacketSize) { + evaluator.evalPacket(i); + } + } + + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + + static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + + int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + TensorEvaluator single_threaded_eval(expr, DefaultDevice()); + + Index i = 0; + vector > results; + results.reserve(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(std::async(std::launch::async, &EvalRange, Index>::run, single_threaded_eval, i*blocksize, (i+1)*blocksize)); + } + + for (int i = 0; i < numblocks; ++i) { + results[i].get(); + } + + if (numblocks * blocksize < size) { + EvalRange, Index>::run(single_threaded_eval, numblocks * blocksize, size, nullptr); + } + + evaluator.cleanup(); + } +}; +#endif + + +// GPU: the evaluation of the expression is offloaded to a GPU. 
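The GPU specialization that follows launches one kernel without bounds checks over as many full blocks as fit, then a second "peel" kernel for the tail. A hypothetical host-side helper (the name and signature are mine, not Eigen's) mirroring that launch arithmetic:

#include <algorithm>

// Illustration only, not part of the patch.
inline void cuda_launch_shape(int size, int& num_blocks, int& block_size,
                              int& peel_num_blocks, int& peel_block_size) {
  block_size = std::min(size, 32 * 32);          // up to 1024 threads per block
  num_blocks = size / block_size;                // full blocks, no bounds check needed
  const int remaining_items = size % block_size; // tail handled by the peel kernel
  peel_block_size = std::min(size, 32);
  peel_num_blocks = remaining_items > 0
      ? (remaining_items + peel_block_size - 1) / peel_block_size
      : 0;
}
// e.g. size = 2500: block_size = 1024, num_blocks = 2 (covers 2048 coefficients),
// then the 452 remaining coefficients run in 15 peel blocks of 32 threads, each
// thread guarded by the "if (index < size)" test in EigenMetaKernelPeel.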
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template +__global__ void EigenMetaKernelNoCheck(Evaluator eval) { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + eval.evalScalar(index); +} +template +__global__ void EigenMetaKernelPeel(Evaluator eval, int peel_start_offset, int size) { + const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + eval.evalScalar(index); + } +} + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + const int block_size = std::min(size, 32*32); + const int num_blocks = size / block_size; + EigenMetaKernelNoCheck > <<>>(evaluator); + + const int remaining_items = size % block_size; + if (remaining_items > 0) { + const int peel_start_offset = num_blocks * block_size; + const int peel_block_size = std::min(size, 32); + const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; + EigenMetaKernelPeel > <<>>(evaluator, peel_start_offset, size); + } + evaluator.cleanup(); + } +}; +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 789c04238..d42167da9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -200,7 +200,9 @@ class TensorFixedSize : public TensorBase::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 000000000..6f6641de6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,142 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
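// [Annotation, not in the original patch] The "type promotion" comment above is
// inherited from the binary-op traits it was copied from; TensorForcedEvalOp has a
// single operand, so Scalar, StorageKind and Index below are taken directly from XprType.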
+ typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + + enum { + Flags = 0, + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorForcedEvalOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + + + +template +class TensorForcedEvalOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + EIGEN_DEVICE_FUNC ~TensorEvaluator() { + eigen_assert(!m_buffer); + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + m_buffer = (Scalar*)m_device.allocate(m_impl.dimensions().TotalSize() * sizeof(Scalar)); + + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + m_impl.cleanup(); + } + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device& m_device; + Scalar* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 27bfe1d73..c0dffbd0c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -25,13 +25,16 @@ template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; +template class TensorAssignOp; + +template class 
TensorEvalToOp; template class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; namespace internal { -template struct TensorAssign; +template class TensorExecutor; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 3a06170fa..c97135b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -246,7 +246,9 @@ template class TensorMap : public Tensor EIGEN_DEVICE_FUNC Self& operator=(const OtherDerived& other) { - internal::TensorAssign::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index e9e74581f..764bba4e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -98,6 +98,13 @@ struct TensorEvaluator, Device> const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); -- cgit v1.2.3 From f8fad09301106c574ed88ffde52e15483d14673f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:33:18 -0700 Subject: Updated the convolution and contraction evaluators to follow the new EvalSubExprsIfNeeded apu. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 7 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index b2e12fd15..8d7a1351e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,9 +184,10 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); @@ -151,11 +151,12 @@ struct TensorEvaluator Date: Wed, 13 Aug 2014 08:36:33 -0700 Subject: Added missing apis. 
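For reference, the assignment-path refactor these patches build on replaces the old internal::TensorAssign entry points with an explicit TensorAssignOp handed to the executor. A minimal reconstruction of that path (the template arguments are assumptions on my part, since the hunks above lost them):

#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only; mirrors what TensorMap::operator= and TensorFixedSize::operator=
// now do with the DefaultDevice.
template <typename Dst, typename Src>
void assign_via_executor(Dst& dst, const Src& src) {
  typedef Eigen::TensorAssignOp<Dst, const Src> Assign;
  Assign assign(dst, src);
  Eigen::internal::TensorExecutor<const Assign, Eigen::DefaultDevice>::run(
      assign, Eigen::DefaultDevice());
}

TensorDevice::operator= does the same with its wrapped device, and the Vectorize flag passed to the executor is taken from TensorEvaluator<...>::PacketAccess.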
--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +++- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8d7a1351e..b2969337f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -173,7 +173,7 @@ struct TensorEvaluator::value> m_leftOffsets; array::value> m_rightOffsets; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 8864c5329..e3068dcae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -148,6 +148,7 @@ struct TensorEvaluator m_inputStride; array m_outputStride; -- cgit v1.2.3 From f1d8c13dbcbe38938dcd727f9b50339a981197c3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:40:26 -0700 Subject: Fixed misc typos. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index b2969337f..897d73806 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,7 +184,7 @@ struct TensorEvaluator Date: Sat, 6 Sep 2014 13:28:24 -0700 Subject: Fixed a typo in the contraction code --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 897d73806..46624724c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } -- cgit v1.2.3 From 12693928228922ecf8fa3fcf14341d195e376a11 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 10:16:59 -0700 Subject: Created the IndexPair type to store pair of tensor indices. CUDA doesn't support std::pair so we can't use them when targeting GPUs. 
Improved the performance on tensor contractions --- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 729 ++++++++++++++++++--- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 7 + 3 files changed, 656 insertions(+), 84 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 3812ecd1f..227522ecb 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -69,11 +69,13 @@ template constexpr inline T const& array_ #undef STD_GET_ARR_HACK template struct array_size; +template struct array_size > { + static const size_t value = N; +}; template struct array_size > { static const size_t value = N; }; - /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 46624724c..1e6f276e0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -20,6 +20,319 @@ namespace Eigen { * */ namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +template +class BaseTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(int i) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? 
col : row; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair(linidx[0], linidx[1]); + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + + +template +class TensorContractionInputMapper; + +template +class TensorContractionSubMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return 
m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template::size : 1), + bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned> +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
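// [Annotation, not in the original patch] For the Lhs mapper the row index walks the
// first non-contracting dimension, whose stride is 1 when inner_dim_contiguous holds,
// so rows i..i+packet_size-1 are adjacent in memory and a single vectorized load is
// safe. For the Rhs mapper the row index walks the contracting dimensions; if the
// contraction pairs were specified out of order (inner_dim_reordered), adjacent rows
// may map to non-adjacent coefficients, hence the extra contiguity check below and
// the element-wise gather into data[] as a fallback.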
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_DEFAULT Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_DEFAULT Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload::type>(data); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + + template struct traits > { @@ -53,6 +366,14 @@ struct nested, 1, typena typedef TensorContractionOp type; }; +template +struct traits, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; +}; + } // end namespace internal @@ -102,143 +423,385 @@ template <> struct max_n_1<0> { }; -template -struct TensorEvaluator, Device> +template +struct TensorContractionEvaluatorBase { + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::Device Device; + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; static const int NumDims = max_n_1::Dimensions::count + 
TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - typedef typename XprType::Index Index; + typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ - false, + IsAligned = true, + PacketAccess = (internal::packet_traits::size > 1), }; - TensorEvaluator(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_device(device), m_result(NULL) { - Index index = 0; - Index stride = 1; - m_shiftright = 1; + eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); + + array::Dimensions::count> lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * m_leftImpl.dimensions()[i]; + } + + array::Dimensions::count> rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * m_rightImpl.dimensions()[i]; + } - int skipped = 0; + m_i_strides[0] = 1; + m_j_strides[0] = 1; + m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + int nocontract_idx = 0; const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { - bool skip = false; - for (int j = 0; j < internal::array_size::value; ++j) { + for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < internal::array_size::value; j++) { if (op.indices()[j].first == i) { - skip = true; - m_leftOffsets[2*skipped] = stride; - m_leftOffsets[2*skipped+1] = stride * left_dims[i]; - m_stitchsize[skipped] = left_dims[i]; + contracting = true; break; } } - if (!skip) { - m_dimensions[index++] = left_dims[i]; - m_shiftright *= left_dims[i]; - } else { - ++skipped; + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size::value) { + m_i_strides[nocontract_idx+1] = m_i_strides[nocontract_idx] * left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * left_dims[i]; + } + dim_idx++; + nocontract_idx++; } - stride *= left_dims[i]; } - stride = 1; - skipped = 0; + nocontract_idx = 0; const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { - bool skip = false; - for (int j = 0; j < internal::array_size::value; ++j) { + for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < internal::array_size::value; 
j++) { if (op.indices()[j].second == i) { - skip = true; - m_rightOffsets[2*skipped] = stride; - m_rightOffsets[2*skipped+1] = stride * right_dims[i]; + contracting = true; break; } } - if (!skip) { - m_dimensions[index++] = right_dims[i]; + if (!contracting) { + m_dimensions[dim_idx] = right_dims[i]; + if (nocontract_idx+1 < internal::array_size::value) { + m_j_strides[nocontract_idx+1] = m_j_strides[nocontract_idx] * right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. + m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < internal::array_size::value; i++) { + Index left = op.indices()[i].first; + Index right = op.indices()[i].second; + + Index size = left_dims[left]; + eigen_assert(size == right_dims[right] && "Contraction axes must be same size"); + + if (i+1 < internal::array_size::value) { + m_k_strides[i+1] = m_k_strides[i] * size; } else { - ++skipped; + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < op.indices()[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; } - stride *= right_dims[i]; } - // Scalar case + // Scalar case. We represent the result as a 1d tensor of size 1. 
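// [Annotation, not in the original patch] The branch below covers full contractions,
// e.g. a dot product of two rank-1 tensors: 1 + 1 == 2 * 1 contraction pair, every
// dimension is contracted away, and the single output coefficient is exposed as a
// tensor of dimensions {1} rather than a rank-0 object.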
if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; - - const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - void evalTo(typename XprType::Scalar* buffer) const { - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - const Index startLeft = index % m_shiftright; - const Index startRight = index / m_shiftright; - CoeffReturnType result = CoeffReturnType(0); - partialStitch(startLeft, startRight, 0, result); - return result; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; } - /* TODO: vectorization template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - assert(false); - }*/ - - private: - EIGEN_DEVICE_FUNC void partialStitch(Index startLeft, Index startRight, int StitchIndex, CoeffReturnType& accum) const { - Index firstLeft = (startLeft / m_leftOffsets[2*StitchIndex]) * m_leftOffsets[2*StitchIndex+1] + (startLeft % m_leftOffsets[2*StitchIndex]); - Index firstRight = (startRight / m_rightOffsets[2*StitchIndex]) * m_rightOffsets[2*StitchIndex+1] + (startRight % m_rightOffsets[2*StitchIndex]); - - for (int j = 0; j < m_stitchsize[StitchIndex]; ++j) { - const Index left = firstLeft+j*m_leftOffsets[2*StitchIndex]; - const Index right = firstRight+j*m_rightOffsets[2*StitchIndex]; - if (StitchIndex < internal::array_size::value-1) { - partialStitch(left, right, StitchIndex+1, accum); - } else { - accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); - } - } + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); } - 
Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + // Prevent assignment + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); - private: - array::value> m_leftOffsets; - array::value> m_rightOffsets; - array::value> m_stitchsize; - Index m_shiftright; Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + const Device& m_device; + Scalar* m_result; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; }; +template +struct TensorEvaluator, Device> : + public TensorContractionEvaluatorBase, Device> > { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + + typedef DSizes Dimensions; + + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template + EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + internal::gebp_kernel gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + 
this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + typedef typename internal::gemm_blocking_space BlockingType; + + // Sizes of the blocks to load in cache. See the Goto paper for details. + BlockingType blocking(m, n, k, true); + const Index kc = blocking.kc(); + const Index mc = (std::min)(m, blocking.mc()); + const Index nc = (std::min)(n, blocking.nc()); + int sizeA = mc * kc; + int sizeB = kc * nc; + + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } +}; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 5a113dc19..11590b474 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,6 +29,13 @@ namespace Eigen { * \sa Tensor */ +// Can't use std::pairs on cuda devices +template struct IndexPair { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } + Index first; + Index second; +}; // Boiler plate code -- cgit v1.2.3 From bfdd9f3ac95d9a2b41e6f2ec1f7434331125b9e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Oct 2014 15:32:59 -0700 Subject: Made the blocking computation aware of the l3 cache Also optimized the blocking parameters to take into account the number of threads used for a computation --- Eigen/src/Core/SolveTriangular.h | 2 +- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 124 ++++++++++++++------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 +-- .../Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/Parallelizer.h | 4 +- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 6 +- Eigen/src/Core/products/TriangularMatrixMatrix.h | 2 +- Eigen/src/Core/products/TriangularSolverMatrix.h | 4 +- blas/level3_impl.h | 12 +- test/product_large.cpp | 7 +- unsupported/Eigen/CXX11/Tensor | 2 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 13 +-- 13 files changed, 117 insertions(+), 79 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index ef17f288e..e158e3162 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? 
RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 090c8f4e6..b91786037 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff } /** \internal */ -inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { - static std::ptrdiff_t m_l1CacheSize = 0; - static std::ptrdiff_t m_l2CacheSize = 0; - if(m_l2CacheSize==0) + static bool m_cache_sizes_initialized = false; + static std::ptrdiff_t m_l1CacheSize = 32*1024; + static std::ptrdiff_t m_l2CacheSize = 256*1024; + static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + + if(!m_cache_sizes_initialized) { - m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); - m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_cache_sizes_initialized = true; } - + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); m_l1CacheSize = *l1; m_l2CacheSize = *l2; + m_l3CacheSize = *l3; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); *l1 = m_l1CacheSize; *l2 = m_l2CacheSize; + *l3 = m_l3CacheSize; } else { @@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ +#define CEIL(a, b) ((a)+(b)-1)/(b) + template -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - EIGEN_UNUSED_VARIABLE(n); // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed @@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) // at the register level. For vectorization purpose, these small vertical panels are unpacked, // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. 
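// [Annotation, not in the original patch] The rewrite below derives the block sizes
// from all three cache levels instead of L1/L2 only:
//   kc : clamped so a kc x nr slice of B' plus an mr x kc slice of A' fit in L1;
//   nc : the per-thread share of n, further clamped so the kc x nc panel of B' fits
//        in what is left of L2;
//   mc : the per-thread share of m, clamped so each thread's mc x kc block of A' fits
//        in its slice of L3 (L3 is shared, so it is divided by num_threads).
// With a single thread the previous fixed caps are kept (k <= 360 or 240 depending on
// scalar size, m and n <= 3840 / sizeof(RhsScalar)).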
- std::ptrdiff_t l1, l2; - - typedef gebp_traits Traits; - enum { - kdiv = KcFactor * 2 * Traits::nr - * Traits::RhsProgress * sizeof(RhsScalar), - mr = gebp_traits::mr, - mr_mask = (0xffffffff/mr)*mr - }; + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + + if (num_threads > 1) { + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + k_mask = (0xffffffff/8)*8, + + mr = Traits::mr, + mr_mask = (0xffffffff/mr)*mr, + + nr = Traits::nr, + nr_mask = (0xffffffff/nr)*nr + }; + SizeType k_cache = (l1-ksub)/kdiv; + if (k_cache < k) { + k = k_cache & k_mask; + eigen_assert(k > 0); + } - manage_caching_sizes(GetAction, &l1, &l2); + SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + SizeType n_per_thread = CEIL(n, num_threads); + if (n_cache <= n_per_thread) { + // Don't exceed the capacity of the l2 cache. + eigen_assert(n_cache >= static_cast(nr)); + n = n_cache & nr_mask; + eigen_assert(n > 0); + } else { + n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + } -// k = std::min(k, l1/kdiv); -// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; -// if(_m l2) { + // l3 is shared between all cores, so we'll give each thread its own chunk of l3. + SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + SizeType m_per_thread = CEIL(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { + m = m_cache & mr_mask; + eigen_assert(m > 0); + } else { + m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + } + } + } + else { + // In unit tests we do not want to use extra large matrices, + // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS -// k = std::min(k,240); -// n = std::min(n,3840/sizeof(RhsScalar)); -// m = std::min(m,3840/sizeof(RhsScalar)); - - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); #else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); #endif + } } template -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - computeProductBlockingSizes(k, m, n); + computeProductBlockingSizes(k, m, n, num_threads); } #ifdef EIGEN_HAS_FUSE_CJMADD @@ -1846,8 +1884,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhsm_mc = ActualRows; this->m_nc = ActualCols; @@ -331,21 +331,21 @@ class gemm_blocking_spacem_mc = Transpose ? cols : rows; this->m_nc = Transpose ? 
rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) { - DenseIndex m = this->m_mc; - computeProductBlockingSizes(this->m_kc, m, this->m_nc); + computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc, num_threads); } - else // full columns + else // no l3 blocking { + DenseIndex m = this->m_mc; DenseIndex n = this->m_nc; - computeProductBlockingSizes(this->m_kc, this->m_mc, n); + computeProductBlockingSizes(this->m_kc, m, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; @@ -451,7 +451,7 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index daa8a1d8a..8de39f76f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -72,7 +72,7 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // !!! mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 4079063eb..837e69415 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index d9e6084c3..21f8175d2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -343,7 +343,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // kc must smaller than mc kc = (std::min)(kc,mc); @@ -432,10 +432,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 77aa3e5ee..4cbb79da0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -412,7 +412,7 @@ struct TriangularProduct Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix0 ? 
l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max((subcols/Traits::nr)*Traits::nr, Traits::nr); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..37a803ced 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal else matrix(c, *m, *n, *ldc) *= beta; } - internal::gemm_blocking_space blocking(*m,*n,*k,true); + internal::gemm_blocking_space blocking(*m,*n,*k,1,true); int code = OP(*opa) | (OP(*opb) << 2); func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0); @@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, a, *lda, b, *ldb, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*n, *m, a, *lda, b, *ldb, blocking); } @@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); } return 1; @@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda(10000,20000); - std::ptrdiff_t l2 = internal::random(1000000,2000000); - setCpuCacheSizes(l1,l2); + std::ptrdiff_t l2 = internal::random(100000,200000); + std::ptrdiff_t l3 = internal::random(1000000,2000000); + setCpuCacheSizes(l1,l2,l3); VERIFY(l1==l1CacheSize()); VERIFY(l2==l2CacheSize()); std::ptrdiff_t k1 = internal::random(10,100)*16; std::ptrdiff_t m1 = internal::random(10,100)*16; std::ptrdiff_t n1 = internal::random(10,100)*16; // only makes sure it compiles fine - internal::computeProductBlockingSizes(k1,m1,n1); + internal::computeProductBlockingSizes(k1,m1,n1,1); } { diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7ec60044e..47447f446 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -55,7 +55,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" -//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1e6f276e0..cd992daab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -766,7 +766,7 @@ struct TensorEvaluator BlockingType; // Sizes of the blocks to load in cache. See the Goto paper for details. 
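
[Illustrative aside, not part of the patch] The kc/mc/nc values produced by computeProductBlockingSizes feed a Goto-style blocked GEMM: in the code above, k (kc) is clamped so the packed mr-by-kc and kc-by-nr strips plus the mr-by-nr accumulator fit in L1, n (nc) is sized against the L2 budget, and m (mc) against each thread's share of L3. A rough, self-contained sketch of the blocking idea these sizes drive (hypothetical helper name; simplified, single-threaded, and with plain scalar loops where the real code packs the blocks and runs the register-tiled gebp micro-kernel):

    #include <cstddef>
    #include <algorithm>

    // Naive cache-blocked C += A * B on column-major float buffers.
    // A is m x k, B is k x n, C is m x n; mc/nc/kc are the block sizes.
    void blocked_gemm(const float* A, const float* B, float* C,
                      std::size_t m, std::size_t n, std::size_t k,
                      std::size_t mc, std::size_t nc, std::size_t kc)
    {
      for (std::size_t j2 = 0; j2 < n; j2 += nc) {        // nc-wide panel of B and C
        for (std::size_t k2 = 0; k2 < k; k2 += kc) {      // kc-deep slice, sized for L1-resident strips
          for (std::size_t i2 = 0; i2 < m; i2 += mc) {    // mc-tall block of A, sized for L2/L3
            const std::size_t jend = std::min(j2 + nc, n);
            const std::size_t kend = std::min(k2 + kc, k);
            const std::size_t iend = std::min(i2 + mc, m);
            for (std::size_t j = j2; j < jend; ++j)
              for (std::size_t p = k2; p < kend; ++p)
                for (std::size_t i = i2; i < iend; ++i)
                  C[i + j * m] += A[i + p * m] * B[p + j * k];   // column-major indexing
          }
        }
      }
    }

The exact loop order and the placement of the packing steps differ in the real kernels; the point is only that each level of the loop nest reuses data that the corresponding block size keeps resident in one level of the cache hierarchy.
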
- BlockingType blocking(m, n, k, true); + BlockingType blocking(m, n, k, 1, true); const Index kc = blocking.kc(); const Index mc = (std::min)(m, blocking.mc()); const Index nc = (std::min)(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index dc0513305..8e4c7c11d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -152,7 +152,7 @@ struct TensorEvaluator(kc, mc, nc/*, num_threads*/); + internal::computeProductBlockingSizes(kc, mc, nc, num_threads); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); @@ -197,9 +197,10 @@ struct TensorEvaluator kernel_promises(num_kernel_promises, p); + std::vector kernel_promises(num_kernel_promises); + for (int i = 0; i < kernel_promises.size(); ++i) { + kernel_promises[i].set_value(); + } for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { const Index k_start = k_block_idx * kc; @@ -275,8 +276,7 @@ struct TensorEvaluator) Func; - this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); } } } @@ -338,7 +338,6 @@ struct TensorEvaluator Date: Thu, 16 Oct 2014 14:52:50 -0700 Subject: Silenced a few compilation warnings Generalized a TensorMap constructor --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 +- unsupported/test/cxx11_tensor_fixed_size.cpp | 10 +++++----- 13 files changed, 24 insertions(+), 22 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 879057f38..ceed09505 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// +// Copyright (C) 2014 Benoit Steiner // Copyright (C) 2013 Christian Seiler // // This Source Code Form is subject to the terms of the Mozilla @@ -82,7 +83,7 @@ class Tensor : public TensorBase > static const std::size_t NumIndices = NumIndices_; - typedef DSizes Dimensions; + typedef DSizes Dimensions; protected: TensorStorage m_storage; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 0e55d4de1..2bd158dac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -114,7 +114,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 9ecea9108..3aa3eba24 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -136,7 +136,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index b8e43f484..74485b15b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -140,7 +140,7 @@ struct TensorEvaluator m_outputStrides; array m_leftStrides; array m_rightStrides; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; + const Axis m_axis; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index cd992daab..0db34adb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -671,10 +671,10 @@ struct TensorContractionEvaluatorBase Index m_j_size; Index m_k_size; - const Device& m_device; - Scalar* m_result; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; + const Device& m_device; + Scalar* m_result; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 34bdd5309..50cb10a33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -230,7 +230,7 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 2c0d2cd0f..0a8c10ac7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -64,7 +64,8 @@ template class TensorMap : public Tensor } #endif - inline TensorMap(PointerArgType dataPtr, const array& dimensions) + template + inline TensorMap(PointerArgType dataPtr, const Dimensions& 
dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 13109f514..686bf5c24 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -130,8 +130,8 @@ struct TensorEvaluator, Device> Scalar* data() const { return m_impl.data(); } protected: - NewDimensions m_dimensions; TensorEvaluator m_impl; + NewDimensions m_dimensions; }; @@ -381,13 +381,13 @@ struct TensorEvaluator, Devi return inputIndex; } - Dimensions m_dimensions; array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; - const StartIndices m_offsets; TensorEvaluator m_impl; const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 8da6e0f26..89c0cff05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -215,11 +215,11 @@ struct TensorEvaluator, Device return rslt; } - PaddingDimensions m_padding; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; + PaddingDimensions m_padding; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 01f2daf52..e2fe32d67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eef992106..cbe87394b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -152,7 +152,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -217,8 +217,8 @@ struct TensorEvaluator, Device> array m_preservedStrides; array m_reducedStrides; array m_reducedDims; - Op m_reducer; TensorEvaluator m_impl; + Op m_reducer; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 7e0063626..831a9f005 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -131,7 +131,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp 
b/unsupported/test/cxx11_tensor_fixed_size.cpp index b0501aaa3..99ffc7f07 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -32,10 +32,10 @@ static void test_1d() vec1(5) = 42.0; vec2(5) = 5.0; float data3[6]; - TensorMap > > vec3(data3, 6); + TensorMap > > vec3(data3, Sizes<6>()); vec3 = vec1.sqrt(); float data4[6]; - TensorMap, RowMajor> > vec4(data4, 6); + TensorMap, RowMajor> > vec4(data4, Sizes<6>()); vec4 = vec2.sqrt(); VERIFY_IS_EQUAL((vec3.size()), 6); @@ -68,9 +68,9 @@ static void test_1d() static void test_2d() { float data1[6]; - TensorMap >> mat1(data1,2,3); + TensorMap >> mat1(data1, Sizes<2, 3>()); float data2[6]; - TensorMap, RowMajor>> mat2(data2,2,3); + TensorMap, RowMajor>> mat2(data2, Sizes<2, 3>()); VERIFY_IS_EQUAL((mat1.size()), 2*3); // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); @@ -166,7 +166,7 @@ static void test_array() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - mat1(array(i,j,k)) = val; + mat1(array{{i,j,k}}) = val; val += 1.0; } } -- cgit v1.2.3 From 65af852b54afca3c76c978c1bfd27d8a1451cab6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Oct 2014 15:02:30 -0700 Subject: Silenced one last warning --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 0db34adb1..c530b27a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -48,7 +48,7 @@ class BaseTensorContractionMapper { m_k_strides(k_strides) { } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(int i) { } + EIGEN_STRONG_INLINE void prefetch(int /*i*/) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { -- cgit v1.2.3 From b1789c112b5cf8d478a03786c6c1243320aefd47 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 3 Nov 2014 08:51:33 -0800 Subject: Improved handling of 1d tensors --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 98 +++++++++++++++++++--- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 12 ++- 2 files changed, 99 insertions(+), 11 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c530b27a7..8e898619d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -48,7 +48,7 @@ class BaseTensorContractionMapper { m_k_strides(k_strides) { } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(int /*i*/) { } + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { @@ -142,6 +142,13 @@ class BaseTensorContractionMapper { return IndexPair(linidx[0], linidx[1]); } + Index firstAligned(Index size) const { + return size; + } + Index stride() const { + return 1; + } + protected: const Tensor m_tensor; const nocontract_t m_nocontract_strides; @@ -202,6 +209,18 @@ class TensorContractionSubMapper { return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); } + template + EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), 
YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template + bool aligned(Index /*i*/) const { + return false; + } + private: const ParentMapper& m_base_mapper; const Index m_vert_offset; @@ -220,6 +239,7 @@ class TensorContractionInputMapper public: typedef BaseTensorContractionMapper Base; typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, @@ -233,6 +253,10 @@ class TensorContractionInputMapper return SubMapper(*this, i, j); } + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; @@ -306,6 +330,7 @@ class TensorContractionInputMapper Base; typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, @@ -319,6 +344,10 @@ class TensorContractionInputMapper::type Packet; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { @@ -592,41 +621,80 @@ struct TensorContractionEvaluatorBase if (this->m_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } else { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } } else { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } else { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } } } + template + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product::run( + rows, cols, lhs, rhs, + 
buffer, resIncr, alpha); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -707,7 +775,17 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index cf1352a31..f0e9bb616 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -93,7 +93,17 @@ struct TensorEvaluator - void evalTyped(Scalar* buffer) const { + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; -- cgit v1.2.3 From 1d3c8306f87b284c26180be6eac13dc8d4aa1b52 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 13 Nov 2014 19:13:17 -0800 Subject: Fixed compilation errors with clang. H: Enter commit message. Lines beginning with 'HG:' are removed. --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 10 +++++----- 4 files changed, 7 insertions(+), 9 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a77903dca..8cb41aec8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -30,7 +30,6 @@ struct traits > : public traits::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8e898619d..c5ec42cf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -848,8 +848,8 @@ struct TensorEvaluator(this->m_device.allocate(sizeA * sizeof(LhsScalar))); RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index ce916fdfd..0dfb6913b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -37,7 +37,6 @@ struct traits > : public traits typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 010221e74..eaf0195ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -110,7 +110,7 @@ 
struct tuple_coeff<0> { update_value(std::get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return is_compile_time_constant >::type>::value & (i == 0); } @@ -190,7 +190,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -198,7 +198,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -213,7 +213,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; @@ -221,7 +221,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; -- cgit v1.2.3 From 71676eaddd7fb6b8abdc5713f437750f3c963fcb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:36:57 -0800 Subject: Added support for RowMajor inputs to the contraction code. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 265 +++++++++++++++------ .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 6 +- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 43 +++- 3 files changed, 220 insertions(+), 94 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c5ec42cf4..a02a273e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -320,6 +320,8 @@ class TensorContractionInputMapper }; + + template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + template struct traits > { @@ -378,6 +388,10 @@ struct traits > typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + // From NumDims below. 
+ static const int NumDimensions = max_n_1::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; + static const int Layout = traits::Layout; + enum { Flags = 0, }; @@ -401,19 +415,19 @@ struct traits::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; }; } // end namespace internal - - template class TensorContractionOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; - typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; typedef typename internal::promote_storage_type::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} - EIGEN_DEVICE_FUNC - const Indices& indices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } protected: typename LhsXprType::Nested m_lhs_xpr; @@ -444,12 +459,17 @@ class TensorContractionOp : public TensorBase struct max_n_1 { - static const size_t size = n; -}; -template <> struct max_n_1<0> { - static const size_t size = 1; -}; +template struct Cond {}; + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond, const T1&, const T2& second) { + return second; +} template @@ -467,37 +487,94 @@ struct TensorContractionEvaluatorBase typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef array::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; - - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; - - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - - typedef DSizes Dimensions; - enum { IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_device(device), m_result(NULL) - { + // Most of the code is assuming that both input tensors are ColMajor. 
If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + static const int NumDims = internal::max_n_1::size; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == + TensorEvaluator::Layout), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); - array::Dimensions::count> lhs_strides; + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (Layout == ColMajor) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. 
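
[Illustrative aside, not part of the patch] A small worked example of the flip performed just below, with hypothetical shapes:

    Original (RowMajor): LHS rank 3, RHS rank 2, op.indices() = {(2, 0)}   // contract LHS axis 2 with RHS axis 0
    After the swap:      eval LHS = original RHS, so LDims = 2; eval RHS = original LHS, so RDims = 3
    Flip:                eval_op_indices[0].first  = LDims - 1 - op.indices()[0].second = 2 - 1 - 0 = 1
                         eval_op_indices[0].second = RDims - 1 - op.indices()[0].first  = 3 - 1 - 2 = 0

The cross between .first and .second reflects the LHS/RHS swap, and the "LDims - 1 - ..." / "RDims - 1 - ..." terms relocate each contracted axis inside the reversed-dimension (ColMajor) view of its operand.
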
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + } + } + + array lhs_strides; lhs_strides[0] = 1; - for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { - lhs_strides[i+1] = lhs_strides[i] * m_leftImpl.dimensions()[i]; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; } - array::Dimensions::count> rhs_strides; + array rhs_strides; rhs_strides[0] = 1; - for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { - rhs_strides[i+1] = rhs_strides[i] * m_rightImpl.dimensions()[i]; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; } m_i_strides[0] = 1; @@ -515,27 +592,28 @@ struct TensorContractionEvaluatorBase m_lhs_inner_dim_contiguous = true; int dim_idx = 0; int nocontract_idx = 0; - const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + + for (int i = 0; i < LDims; i++) { // find if we are contracting on index i of left tensor bool contracting = false; - for (int j = 0; j < internal::array_size::value; j++) { - if (op.indices()[j].first == i) { + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { contracting = true; break; } } if (!contracting) { // add dimension size to output dimensions - m_dimensions[dim_idx] = left_dims[i]; + m_dimensions[dim_idx] = eval_left_dims[i]; m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; if (dim_idx != i) { m_lhs_inner_dim_contiguous = false; } if (nocontract_idx+1 < internal::array_size::value) { - m_i_strides[nocontract_idx+1] = m_i_strides[nocontract_idx] * left_dims[i]; + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; } else { - m_i_size = m_i_strides[nocontract_idx] * left_dims[i]; + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; } dim_idx++; nocontract_idx++; @@ -543,22 +621,22 @@ struct TensorContractionEvaluatorBase } nocontract_idx = 0; - const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + for (int i = 0; i < RDims; i++) { bool contracting = false; // find if we are contracting on index i of right tensor - for (int j = 0; j < internal::array_size::value; j++) { - if (op.indices()[j].second == i) { + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { contracting = true; break; } } if (!contracting) { - m_dimensions[dim_idx] = right_dims[i]; + m_dimensions[dim_idx] = eval_right_dims[i]; if (nocontract_idx+1 < internal::array_size::value) { - m_j_strides[nocontract_idx+1] = m_j_strides[nocontract_idx] * right_dims[i]; + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; } else { - m_j_size = m_j_strides[nocontract_idx] * right_dims[i]; + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; } m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; dim_idx++; @@ -573,12 +651,13 @@ struct TensorContractionEvaluatorBase // each tensor, we'll only look at the first tensor here. 
m_rhs_inner_dim_contiguous = true; m_rhs_inner_dim_reordered = false; - for (int i = 0; i < internal::array_size::value; i++) { - Index left = op.indices()[i].first; - Index right = op.indices()[i].second; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; - Index size = left_dims[left]; - eigen_assert(size == right_dims[right] && "Contraction axes must be same size"); + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); if (i+1 < internal::array_size::value) { m_k_strides[i+1] = m_k_strides[i] * size; @@ -588,7 +667,7 @@ struct TensorContractionEvaluatorBase m_left_contracting_strides[i] = lhs_strides[left]; m_right_contracting_strides[i] = rhs_strides[right]; - if (i > 0 && right < op.indices()[i-1].second) { + if (i > 0 && right < eval_op_indices[i-1].second) { m_rhs_inner_dim_reordered = true; } if (right != i) { @@ -597,9 +676,16 @@ struct TensorContractionEvaluatorBase } // Scalar case. We represent the result as a 1d tensor of size 1. - if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (LDims + RDims == 2 * ContractDims) { m_dimensions[0] = 1; } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (Layout == RowMajor) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + std::swap(m_dimensions[i], m_dimensions[j]); + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -661,10 +747,10 @@ struct TensorContractionEvaluatorBase const Index rows = m_i_size; const Index cols = m_k_size; - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; const int lhs_packet_size = internal::packet_traits::size; const int rhs_packet_size = internal::packet_traits::size; typedef internal::TensorContractionInputMapper m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; const Device& m_device; Scalar* m_result; }; +// evaluator for default device template struct TensorEvaluator, Device> : - public TensorContractionEvaluatorBase, Device> > { + public TensorContractionEvaluatorBase< + TensorEvaluator, Device> > { typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; @@ -759,15 +846,35 @@ struct TensorEvaluator::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
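
[Illustrative aside, not part of the patch] The swap relies on a standard identity: a RowMajor tensor has the same memory layout as a ColMajor tensor with its dimensions reversed. For matrices, computing C = A * B with everything RowMajor is therefore, in ColMajor terms, computing C^T = B^T * A^T, using (A * B)^T = B^T * A^T. Swapping the roles of LHS and RHS and reversing each operand's dimensions lets the existing ColMajor kernels write out the RowMajor result directly; the final reversal of m_dimensions restores the user-visible ordering.
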
+ typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + static const int NumDims = internal::max_n_1::size; + + // Could we use NumDimensions here? typedef DSizes Dimensions; @@ -799,15 +906,15 @@ struct TensorEvaluatorm_device.memset(buffer, 0, m * n * sizeof(Scalar)); // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; typedef typename internal::gebp_traits Traits; const Index nr = Traits::nr; const Index mr = Traits::mr; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; const int lhs_packet_size = internal::packet_traits::size; const int rhs_packet_size = internal::packet_traits::size; @@ -826,10 +933,10 @@ struct TensorEvaluator OutputMapper; - // Declare GEBP packing and kernel structs internal::gemm_pack_lhs pack_lhs; internal::gemm_pack_rhs pack_rhs; + internal::gebp_kernel gebp; // initialize data mappers diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index f6bd949bd..588770bb4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1241,10 +1241,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = internal::max_n_1::size; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index f0e9bb616..5851e5adc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -70,24 +70,43 @@ struct TensorEvaluator::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; - - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; - - static const int NumDims = max_n_1::Dimensions::count + 
TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + static const int NumDims = max_n_1::size; typedef DSizes Dimensions; // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; typedef typename internal::gebp_traits Traits; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} -- cgit v1.2.3 From e896c0ade7c77a18acb1b3ef01f22ef698c1a2a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 29 Jan 2015 10:29:47 -0800 Subject: Marked the contraction operation as read only, since its result can't be assigned. 
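
[Illustrative aside, not part of the patch] In practice this means a contraction expression can be evaluated and assigned into another tensor, but cannot itself appear on the left-hand side of an assignment. A rough usage sketch, assuming the usual contract() front-end and Eigen::IndexPair (hypothetical sizes):

    // #include <unsupported/Eigen/CXX11/Tensor>
    Eigen::Tensor<float, 2> a(3, 4), b(4, 5);
    a.setRandom();
    b.setRandom();
    Eigen::array<Eigen::IndexPair<int>, 1> dims = {{ Eigen::IndexPair<int>(1, 0) }};
    Eigen::Tensor<float, 2> c = a.contract(b, dims);   // fine: the contraction result is read
    // a.contract(b, dims) = c;                        // rejected: ReadOnlyAccessors exposes no assignment operators
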
--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index a02a273e7..af843654c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -423,7 +423,7 @@ struct traits -class TensorContractionOp : public TensorBase > +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; -- cgit v1.2.3 From 410895a7e4276fa2e1f78dbb953c7045818a86ae Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:13:19 -0800 Subject: Silenced several compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 16 ++++++++-------- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 18 +++++++++--------- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 93938bd1b..a4f73b2a1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -103,7 +103,7 @@ struct TensorEvaluator, Device> m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); // The dimensions of the lhs and the rhs tensors should be equal to prevent // overflows and ensure the result is fully initialized. eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_leftImpl.dimensions())); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 503803d23..698bcfe18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -257,13 +257,13 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if ((Layout == ColMajor && m_dim.actualDim() == 0) || - (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims-1) || - (Layout == RowMajor && m_dim.actualDim() == 0)) { + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. 
eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -322,8 +322,8 @@ struct TensorEvaluator, Device> static const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - if ((this->Layout == ColMajor && this->m_dim.actualDim() == 0) || - (this->Layout == RowMajor && this->m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; @@ -333,8 +333,8 @@ struct TensorEvaluator, Device> this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } - } else if ((this->Layout == ColMajor && this->m_dim.actualDim() == NumInputDims-1) || - (this->Layout == RowMajor && this->m_dim.actualDim() == 0)) { + } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index af843654c..e750c21e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -499,9 +499,9 @@ struct TensorContractionEvaluatorBase // If we want to compute A * B = C, where A is LHS and B is RHS, the code // will pretend B is LHS and A is RHS. 
typedef typename internal::conditional< - Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; @@ -520,14 +520,14 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(), + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), op.lhsExpression()), device), m_device(device), m_result(NULL) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == - TensorEvaluator::Layout), + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); @@ -681,7 +681,7 @@ struct TensorContractionEvaluatorBase } // If the layout is RowMajor, we need to reverse the m_dimensions - if (Layout == RowMajor) { + if (static_cast(Layout) == static_cast(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { std::swap(m_dimensions[i], m_dimensions[j]); } @@ -855,9 +855,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index e358e6a3a..8b87f1045 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -79,9 +79,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 97f225f0a..5e167d4aa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -94,14 +94,14 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { eigen_assert(m_data); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { return m_data[m_dims.IndexOfRowMajor(coords)]; } } - Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: Scalar* m_data; diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 7e448f7c0..c00810594 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -112,7 +112,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented }; @@ -169,7 +169,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented }; -- cgit v1.2.3 From 114e863f086077fc949baf5dfe1f4102222c938e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:20:24 -0800 Subject: Silcenced a few compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 12 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 14 +++++++------- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index ef134adf2..5790e19d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -106,7 +106,7 @@ struct TensorEvaluator, Device> m_dimensions[i] = input_dims[i] * broadcast[i]; } - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_inputStrides[0] = 1; m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -139,7 +139,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return coeffColMajor(index); } else { return coeffRowMajor(index); @@ -210,7 +210,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return packetColMajor(index); } else { return packetRowMajor(index); @@ -326,7 +326,7 @@ struct TensorEvaluator, Device> } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e750c21e7..f7254a24d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -536,7 +536,7 @@ struct TensorContractionEvaluatorBase DSizes eval_left_dims; DSizes eval_right_dims; array, ContractDims> eval_op_indices; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { // For ColMajor, we keep using the existing dimensions for (int i = 0; i < LDims; i++) { eval_left_dims[i] = m_leftImpl.dimensions()[i]; diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 5e167d4aa..488d32cb4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -167,7 +167,7 @@ struct TensorEvaluator #endif } - const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } protected: const Scalar* m_data; @@ -218,7 +218,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(index); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const NullaryOp m_functor; @@ -273,7 +273,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const UnaryOp m_functor; @@ -301,7 +301,7 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); } @@ -337,7 +337,7 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const BinaryOp m_functor; @@ -413,7 +413,7 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_condImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 21416afe0..7643d4cdc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -245,7 +245,7 @@ struct TensorEvaluator, Device> } // Precompute output strides. - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> // Precompute input strides. array input_strides; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { input_strides[0] = 1; for (int i = 1; i < NumInputDims; ++i) { input_strides[i] = input_strides[i-1] * input_dims[i-1]; @@ -309,7 +309,7 @@ struct TensorEvaluator, Device> Op reducer(m_reducer); if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); } else { @@ -330,7 +330,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? 
m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; const Index firstIndex = firstInput(index); for (Index i = 0; i < packetSize; ++i) { Op reducer(m_reducer); @@ -339,7 +339,7 @@ struct TensorEvaluator, Device> } } else if (PreservingInnerMostDims) { const Index firstIndex = firstInput(index); - const int innermost_dim = (Layout == ColMajor) ? 0 : NumOutputDims - 1; + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; // TBD: extend this the the n innermost dimensions that we preserve. if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { Op reducer(m_reducer); @@ -371,7 +371,7 @@ struct TensorEvaluator, Device> // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { if (ReducingInnerMostDims) { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return index * m_preservedStrides[0]; } else { return index * m_preservedStrides[NumOutputDims - 1]; @@ -379,7 +379,7 @@ struct TensorEvaluator, Device> } // TBD: optimize the case where we preserve the innermost dimensions. Index startInput = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumOutputDims - 1; i > 0; --i) { // This is index_i in the output tensor. const Index idx = index / m_outputStrides[i]; -- cgit v1.2.3
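
[Illustrative aside, not part of the patches] The last two commits replace comparisons such as Layout == ColMajor with static_cast<int>(Layout) == static_cast<int>(ColMajor). The two sides belong to different enumeration types (the evaluator's anonymous enum { Layout = ... } versus the library-wide storage-order enum), and some compilers warn about comparing values of distinct enum types (for example clang's and GCC's -Wenum-compare). Casting both sides to int leaves the comparison's value unchanged while silencing the warning. A minimal, self-contained illustration with hypothetical types, not Eigen's own declarations:

    enum StorageOrder { ColMajor = 0, RowMajor = 1 };

    struct Evaluator {
      enum { Layout = RowMajor };   // anonymous enum, initialized from a StorageOrder value
    };

    bool is_col_major() {
      // return Evaluator::Layout == ColMajor;   // may trigger -Wenum-compare: distinct enumeration types
      return static_cast<int>(Evaluator::Layout) == static_cast<int>(ColMajor);   // same result, warning-free
    }
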