author | Gael Guennebaud <g.gael@free.fr> | 2009-08-06 12:20:02 +0200
---|---|---
committer | Gael Guennebaud <g.gael@free.fr> | 2009-08-06 12:20:02 +0200
commit | 56d00779db6975fea0821a71abf6323f98a1f4c0 (patch) |
tree | 2e49e38c08bc6be41702a21b0987cd0b0c49e8fe /Eigen/src/Core |
parent | 6b2ab13ac54aff7ff15046d05b3f060a3f1f2044 (diff) |
more product refactoring
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- | Eigen/src/Core/Product.h | 413
-rw-r--r-- | Eigen/src/Core/SelfAdjointView.h | 95
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 443
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 458
-rw-r--r-- | Eigen/src/Core/products/GeneralUnrolled.h | 385
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 52
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixVector.h | 43
-rw-r--r-- | Eigen/src/Core/util/BlasUtil.h | 2 |
8 files changed, 981 insertions, 910 deletions
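For context before the diff itself: the new comments in GeneralMatrixMatrix.h describe a Goto-style blocked GEMM (the GEMM_VAR1 / GEPP_VAR1 loop structure), where one horizontal rhs panel is packed once per depth block and each lhs block is packed and then reused by the block * panel kernel for all columns. The sketch below is not Eigen code: it is a minimal, scalar stand-in that only illustrates that loop and packing structure. `packRhsPanel`, `packLhsBlock` and `gebp` are hypothetical, naive substitutes for `ei_gemm_pack_rhs`, `ei_gemm_pack_lhs` and `ei_gebp_kernel`, and the column-major layout and default block sizes are assumptions for the example only.

```cpp
#include <algorithm>
#include <vector>

// All matrices are column-major with a leading dimension ("stride").

// Copy a kc x cols horizontal panel of the rhs into contiguous memory.
static void packRhsPanel(std::vector<double>& blockB, const double* rhs,
                         int rhsStride, int k2, int kc, int cols)
{
  for (int j = 0; j < cols; ++j)
    for (int k = 0; k < kc; ++k)
      blockB[j * kc + k] = rhs[j * rhsStride + k2 + k];
}

// Copy an mc x kc block of the lhs into contiguous memory.
static void packLhsBlock(std::vector<double>& blockA, const double* lhs,
                         int lhsStride, int i2, int k2, int mc, int kc)
{
  for (int k = 0; k < kc; ++k)
    for (int i = 0; i < mc; ++i)
      blockA[k * mc + i] = lhs[(k2 + k) * lhsStride + i2 + i];
}

// Naive block * panel kernel: res(0:mc, 0:cols) += blockA * blockB.
static void gebp(double* res, int resStride, const double* blockA,
                 const double* blockB, int mc, int kc, int cols)
{
  for (int j = 0; j < cols; ++j)
    for (int k = 0; k < kc; ++k)
      for (int i = 0; i < mc; ++i)
        res[j * resStride + i] += blockA[k * mc + i] * blockB[j * kc + k];
}

// res += lhs * rhs, blocked so that one packed rhs panel is reused for every
// lhs block, and one packed lhs block is reused for every column of the panel.
void blockedGemm(int rows, int cols, int depth,
                 const double* lhs, int lhsStride,
                 const double* rhs, int rhsStride,
                 double* res, int resStride,
                 int kc = 256, int mc = 512)
{
  std::vector<double> blockA(static_cast<std::size_t>(kc) * mc);
  std::vector<double> blockB(static_cast<std::size_t>(kc) * cols);
  for (int k2 = 0; k2 < depth; k2 += kc)                 // GEMM_VAR1
  {
    const int actual_kc = std::min(k2 + kc, depth) - k2;
    packRhsPanel(blockB, rhs, rhsStride, k2, actual_kc, cols);
    for (int i2 = 0; i2 < rows; i2 += mc)                // GEPP_VAR1
    {
      const int actual_mc = std::min(i2 + mc, rows) - i2;
      packLhsBlock(blockA, lhs, lhsStride, i2, k2, actual_mc, actual_kc);
      gebp(res + i2, resStride, blockA.data(), blockB.data(),
           actual_mc, actual_kc, cols);
    }
  }
}
```

The real kernel in GeneralBlockPanelKernel.h additionally register-blocks the result in mr x nr tiles, expands each packed rhs coefficient into a packet, and peels the depth loop by four; the sketch keeps only the cache-blocking skeleton that the commit's comments document.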
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index a4ece7f0d..fe559a991 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -63,23 +63,22 @@ template<typename Lhs, typename Rhs> struct ei_product_type Cols = Rhs::ColsAtCompileTime, Depth = EIGEN_ENUM_MIN(Lhs::ColsAtCompileTime,Rhs::RowsAtCompileTime), - value = ei_product_type_selector<(Rows>8 ? Large : (Rows==1 ? 1 : Small)), - (Cols>8 ? Large : (Cols==1 ? 1 : Small)), - (Depth>8 ? Large : (Depth==1 ? 1 : Small))>::ret + value = ei_product_type_selector<(Rows >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Rows==1 ? 1 : Small)), + (Cols >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Cols==1 ? 1 : Small)), + (Depth>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Depth==1 ? 1 : Small))>::ret }; }; +/* The following allows to select the kind of product at compile time + * based on the three dimensions of the product. + * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ +// FIXME I'm not sure the current mapping is the ideal one. template<int Rows, int Cols> struct ei_product_type_selector<Rows,Cols,1> { enum { ret = OuterProduct }; }; template<int Depth> struct ei_product_type_selector<1,1,Depth> { enum { ret = InnerProduct }; }; template<> struct ei_product_type_selector<1,1,1> { enum { ret = InnerProduct }; }; template<> struct ei_product_type_selector<Small,1,Small> { enum { ret = UnrolledProduct }; }; template<> struct ei_product_type_selector<1,Small,Small> { enum { ret = UnrolledProduct }; }; template<> struct ei_product_type_selector<Small,Small,Small> { enum { ret = UnrolledProduct }; }; - -// template<> struct ei_product_type_selector<Small,1,Small> { enum { ret = GemvProduct }; }; -// template<> struct ei_product_type_selector<1,Small,Small> { enum { ret = GemvProduct }; }; -// template<> struct ei_product_type_selector<Small,Small,Small> { enum { ret = GemmProduct }; }; - template<> struct ei_product_type_selector<1,Large,Small> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1,Large,Large> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1,Small,Large> { enum { ret = GemvProduct }; }; @@ -130,48 +129,6 @@ struct ProductReturnType<Lhs,Rhs,UnrolledProduct> /*********************************************************************** -* Implementation of General Matrix Matrix Product -***********************************************************************/ - -template<typename Lhs, typename Rhs> -struct ei_traits<GeneralProduct<Lhs,Rhs,GemmProduct> > - : ei_traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> > -{}; - -template<typename Lhs, typename Rhs> -class GeneralProduct<Lhs, Rhs, GemmProduct> - : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> -{ - public: - EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) - - GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - template<typename Dest> void addTo(Dest& dst, Scalar alpha) const - { - ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - - const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); - const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); - - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); - - ei_general_matrix_matrix_product< - Scalar, - (_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsBlasTraits::NeedToConjugate), - (_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, 
bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit)?RowMajor:ColMajor> - ::run( - this->rows(), this->cols(), lhs.cols(), - (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), - (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), - (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), - actualAlpha); - } -}; - -/*********************************************************************** * Implementation of Inner Vector Vector Product ***********************************************************************/ @@ -403,362 +360,6 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,false> } }; -/*********************************************************************** -* Implementation of products with small fixed sizes -***********************************************************************/ - -/* Since the all the dimensions of the product are small, here we can rely - * on the generic Assign mechanism to evaluate the product per coeff (or packet). - * - * Note that the here inner-loops should always be unrolled. - */ - -template<int VectorizationMode, int Index, typename Lhs, typename Rhs, typename RetScalar> -struct ei_product_coeff_impl; - -template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl; - -template<typename LhsNested, typename RhsNested> -struct ei_traits<GeneralProduct<LhsNested,RhsNested,UnrolledProduct> > -{ - typedef typename ei_cleantype<LhsNested>::type _LhsNested; - typedef typename ei_cleantype<RhsNested>::type _RhsNested; - typedef typename ei_scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar; - - enum { - LhsCoeffReadCost = _LhsNested::CoeffReadCost, - RhsCoeffReadCost = _RhsNested::CoeffReadCost, - LhsFlags = _LhsNested::Flags, - RhsFlags = _RhsNested::Flags, - - RowsAtCompileTime = _LhsNested::RowsAtCompileTime, - ColsAtCompileTime = _RhsNested::ColsAtCompileTime, - InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime), - - MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime, - MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime, - - LhsRowMajor = LhsFlags & RowMajorBit, - RhsRowMajor = RhsFlags & RowMajorBit, - - CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) - && (ColsAtCompileTime == Dynamic || (ColsAtCompileTime % ei_packet_traits<Scalar>::size) == 0), - - CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) - && (RowsAtCompileTime == Dynamic || (RowsAtCompileTime % ei_packet_traits<Scalar>::size) == 0), - - EvalToRowMajor = RhsRowMajor && (!CanVectorizeLhs), - - RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit), - - Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits) - | EvalBeforeAssigningBit - | EvalBeforeNestingBit - | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0) - | (LhsFlags & RhsFlags & AlignedBit), - - CoeffReadCost = InnerSize == Dynamic ? Dynamic - : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) - + (InnerSize - 1) * NumTraits<Scalar>::AddCost, - - /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside - * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner - * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. 
Since it doesn't affect - * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. - */ - CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit) - && (InnerSize % ei_packet_traits<Scalar>::size == 0) - }; -}; - -template<typename LhsNested, typename RhsNested> class GeneralProduct<LhsNested,RhsNested,UnrolledProduct> - : ei_no_assignment_operator, - public MatrixBase<GeneralProduct<LhsNested, RhsNested, UnrolledProduct> > -{ - public: - - EIGEN_GENERIC_PUBLIC_INTERFACE(GeneralProduct) - - private: - - typedef typename ei_traits<GeneralProduct>::_LhsNested _LhsNested; - typedef typename ei_traits<GeneralProduct>::_RhsNested _RhsNested; - - enum { - PacketSize = ei_packet_traits<Scalar>::size, - InnerSize = ei_traits<GeneralProduct>::InnerSize, - Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, - CanVectorizeInner = ei_traits<GeneralProduct>::CanVectorizeInner - }; - - typedef ei_product_coeff_impl<CanVectorizeInner ? InnerVectorization : NoVectorization, - Unroll ? InnerSize-1 : Dynamic, - _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl; - - public: - - template<typename Lhs, typename Rhs> - inline GeneralProduct(const Lhs& lhs, const Rhs& rhs) - : m_lhs(lhs), m_rhs(rhs) - { - // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable. - // We still allow to mix T and complex<T>. - EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - ei_assert(lhs.cols() == rhs.rows() - && "invalid matrix product" - && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); - } - - EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); } - EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); } - - EIGEN_STRONG_INLINE const Scalar coeff(int row, int col) const - { - Scalar res; - ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); - return res; - } - - /* Allow index-based non-packet access. It is impossible though to allow index-based packed access, - * which is why we don't set the LinearAccessBit. - */ - EIGEN_STRONG_INLINE const Scalar coeff(int index) const - { - Scalar res; - const int row = RowsAtCompileTime == 1 ? 0 : index; - const int col = RowsAtCompileTime == 1 ? index : 0; - ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); - return res; - } - - template<int LoadMode> - EIGEN_STRONG_INLINE const PacketScalar packet(int row, int col) const - { - PacketScalar res; - ei_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor, - Unroll ? 
InnerSize-1 : Dynamic, - _LhsNested, _RhsNested, PacketScalar, LoadMode> - ::run(row, col, m_lhs, m_rhs, res); - return res; - } - - protected: - const LhsNested m_lhs; - const RhsNested m_rhs; -}; - - -/*************************************************************************** -* Normal product .coeff() implementation (with meta-unrolling) -***************************************************************************/ - -/************************************** -*** Scalar path - no vectorization *** -**************************************/ - -template<int Index, typename Lhs, typename Rhs, typename RetScalar> -struct ei_product_coeff_impl<NoVectorization, Index, Lhs, Rhs, RetScalar> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - ei_product_coeff_impl<NoVectorization, Index-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res); - res += lhs.coeff(row, Index) * rhs.coeff(Index, col); - } -}; - -template<typename Lhs, typename Rhs, typename RetScalar> -struct ei_product_coeff_impl<NoVectorization, 0, Lhs, Rhs, RetScalar> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - res = lhs.coeff(row, 0) * rhs.coeff(0, col); - } -}; - -template<typename Lhs, typename Rhs, typename RetScalar> -struct ei_product_coeff_impl<NoVectorization, Dynamic, Lhs, Rhs, RetScalar> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar& res) - { - ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = lhs.coeff(row, 0) * rhs.coeff(0, col); - for(int i = 1; i < lhs.cols(); ++i) - res += lhs.coeff(row, i) * rhs.coeff(i, col); - } -}; - -// prevent buggy user code from causing an infinite recursion -template<typename Lhs, typename Rhs, typename RetScalar> -struct ei_product_coeff_impl<NoVectorization, -1, Lhs, Rhs, RetScalar> -{ - EIGEN_STRONG_INLINE static void run(int, int, const Lhs&, const Rhs&, RetScalar&) {} -}; - -/******************************************* -*** Scalar path with inner vectorization *** -*******************************************/ - -template<int Index, typename Lhs, typename Rhs, typename PacketScalar> -struct ei_product_coeff_vectorized_unroller -{ - enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) - { - ei_product_coeff_vectorized_unroller<Index-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); - pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, Index) , rhs.template packet<Aligned>(Index, col) )); - } -}; - -template<typename Lhs, typename Rhs, typename PacketScalar> -struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) - { - pres = ei_pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col)); - } -}; - -template<int Index, typename Lhs, typename Rhs, typename RetScalar> -struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs, RetScalar> -{ - typedef typename Lhs::PacketScalar PacketScalar; - enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - PacketScalar pres; - ei_product_coeff_vectorized_unroller<Index+1-PacketSize, Lhs, Rhs, 
PacketScalar>::run(row, col, lhs, rhs, pres); - ei_product_coeff_impl<NoVectorization,Index,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res); - res = ei_predux(pres); - } -}; - -template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime> -struct ei_product_coeff_vectorized_dyn_selector -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = ei_dot_impl< - Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>, - Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>, - LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col)); - } -}; - -// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower -// NOTE maybe they are now useless since we have a specialization for Block<Matrix> -template<typename Lhs, typename Rhs, int RhsCols> -struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols> -{ - EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = ei_dot_impl< - Lhs, - Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>, - LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col)); - } -}; - -template<typename Lhs, typename Rhs, int LhsRows> -struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1> -{ - EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = ei_dot_impl< - Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>, - Rhs, - LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs); - } -}; - -template<typename Lhs, typename Rhs> -struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1> -{ - EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = ei_dot_impl< - Lhs, - Rhs, - LinearVectorization, NoUnrolling>::run(lhs, rhs); - } -}; - -template<typename Lhs, typename Rhs, typename RetScalar> -struct ei_product_coeff_impl<InnerVectorization, Dynamic, Lhs, Rhs, RetScalar> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res); - } -}; - -/******************* -*** Packet path *** -*******************/ - -template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, Index, Lhs, Rhs, PacketScalar, LoadMode> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) - { - ei_product_packet_impl<RowMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); - res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res); - } -}; - -template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, Index, Lhs, Rhs, PacketScalar, LoadMode> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) - { - ei_product_packet_impl<ColMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); - res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res); - } -}; - -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> -{ - 
EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) - { - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); - } -}; - -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) - { - res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); - } -}; - -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) - { - ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); - for(int i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); - } -}; - -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> -{ - EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) - { - ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); - for(int i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res); - } -}; - /*************************************************************************** * Implementation of matrix base methods ***************************************************************************/ diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 883edd165..95ce666c9 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -197,101 +197,6 @@ struct ei_triangular_assignment_selector<Derived1, Derived2, SelfAdjoint, Dynami }; /*************************************************************************** -* Wrapper to ei_product_selfadjoint_vector -***************************************************************************/ - -template<typename Lhs, int LhsMode, typename Rhs> -struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> > - : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> > -{}; - -template<typename Lhs, int LhsMode, typename Rhs> -struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> - : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs > -{ - EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) - - enum { - LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit) - }; - - SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - template<typename Dest> void addTo(Dest& dst, Scalar alpha) const - { - ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - - const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); - const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); - - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); - - ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet"); - ei_product_selfadjoint_vector<Scalar, 
ei_traits<_ActualLhsType>::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)> - ( - lhs.rows(), // size - &lhs.coeff(0,0), lhs.stride(), // lhs info - &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info - &dst.coeffRef(0), // result info - actualAlpha // scale factor - ); - } -}; - -/*************************************************************************** -* Wrapper to ei_product_selfadjoint_matrix -***************************************************************************/ - -template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> -struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> > - : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> > -{}; - -template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> -struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> - : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs > -{ - EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) - - SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - enum { - LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit), - LhsIsSelfAdjoint = (LhsMode&SelfAdjointBit)==SelfAdjointBit, - RhsUpLo = RhsMode&(UpperTriangularBit|LowerTriangularBit), - RhsIsSelfAdjoint = (RhsMode&SelfAdjointBit)==SelfAdjointBit - }; - - template<typename Dest> void addTo(Dest& dst, Scalar alpha) const - { - ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - - const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); - const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); - - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); - - ei_product_selfadjoint_matrix<Scalar, - EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular, - ei_traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, - NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,bool(LhsBlasTraits::NeedToConjugate)), - EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular, - ei_traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, - NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,bool(RhsBlasTraits::NeedToConjugate)), - ei_traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor> - ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeff(0,0), lhs.stride(), // lhs info - &rhs.coeff(0,0), rhs.stride(), // rhs info - &dst.coeffRef(0,0), dst.stride(), // result info - actualAlpha // alpha - ); - } -}; - -/*************************************************************************** * Implementation of MatrixBase methods ***************************************************************************/ diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h new file mode 100644 index 000000000..129fd86e7 --- /dev/null +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -0,0 +1,443 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. 
+// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_GENERAL_BLOCK_PANEL_H +#define EIGEN_GENERAL_BLOCK_PANEL_H + +#ifndef EIGEN_EXTERN_INSTANTIATIONS + +// optimized GEneral packed Block * packed Panel product kernel +template<typename Scalar, int mr, int nr, typename Conj> +struct ei_gebp_kernel +{ + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0) + { + typedef typename ei_packet_traits<Scalar>::type PacketType; + enum { PacketSize = ei_packet_traits<Scalar>::size }; + if(strideA==-1) strideA = depth; + if(strideB==-1) strideB = depth; + Conj cj; + int packet_cols = (cols/nr) * nr; + const int peeled_mc = (rows/mr)*mr; + // loops on each cache friendly block of the result/rhs + for(int j2=0; j2<packet_cols; j2+=nr) + { + // loops on each register blocking of lhs/res + for(int i=0; i<peeled_mc; i+=mr) + { + const Scalar* blA = &blockA[i*strideA+offsetA*mr]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + // TODO move the res loads to the stores + + // gets res block as register + PacketType C0, C1, C2, C3, C4, C5, C6, C7; + C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + C1 = ei_ploadu(&res[(j2+1)*resStride + i]); + if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); + if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); + C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); + C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]); + if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]); + if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]); + + // performs "inner" product + // TODO let's check wether the flowing peeled loop could not be + // optimized via optimal prefetching from one loop to the other + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; + const int peeled_kc = (depth/4)*4; + for(int k=0; k<peeled_kc; k+=4) + { + PacketType B0, B1, B2, B3, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + C4 = cj.pmadd(A1, B0, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 
5 : 3)*PacketSize]); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[2*PacketSize]); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[3*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + C4 = cj.pmadd(A1, B0, C4); + B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[4*PacketSize]); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[5*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); + + C0 = cj.pmadd(A0, B0, C0); + C4 = cj.pmadd(A1, B0, C4); + B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[6*PacketSize]); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[7*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + C4 = cj.pmadd(A1, B0, C4); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + + blB += 4*nr*PacketSize; + blA += 4*mr; + } + // process remaining peeled loop + for(int k=peeled_kc; k<depth; k++) + { + PacketType B0, B1, B2, B3, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + C4 = cj.pmadd(A1, B0, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + + blB += nr*PacketSize; + blA += mr; + } + + ei_pstoreu(&res[(j2+0)*resStride + i], C0); + ei_pstoreu(&res[(j2+1)*resStride + i], C1); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); + ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); + ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7); + } + for(int i=peeled_mc; i<rows; i++) + { + const Scalar* blA = &blockA[i*strideA+offsetA]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + // gets a 1 x nr res block as registers + Scalar C0(0), C1(0), C2(0), C3(0); + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; + for(int k=0; k<depth; k++) + { + Scalar B0, B1, B2, B3, A0; + + A0 = blA[k]; + B0 = blB[0*PacketSize]; + B1 = blB[1*PacketSize]; + C0 = cj.pmadd(A0, B0, C0); + if(nr==4) B2 = blB[2*PacketSize]; + if(nr==4) B3 = blB[3*PacketSize]; + C1 = cj.pmadd(A0, B1, C1); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C3 = cj.pmadd(A0, B3, 
C3); + + blB += nr*PacketSize; + } + res[(j2+0)*resStride + i] += C0; + res[(j2+1)*resStride + i] += C1; + if(nr==4) res[(j2+2)*resStride + i] += C2; + if(nr==4) res[(j2+3)*resStride + i] += C3; + } + } + + // process remaining rhs/res columns one at a time + // => do the same but with nr==1 + for(int j2=packet_cols; j2<cols; j2++) + { + for(int i=0; i<peeled_mc; i+=mr) + { + const Scalar* blA = &blockA[i*strideA+offsetA*mr]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + // TODO move the res loads to the stores + + // gets res block as register + PacketType C0, C4; + C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); + + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; + for(int k=0; k<depth; k++) + { + PacketType B0, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + C4 = cj.pmadd(A1, B0, C4); + + blB += PacketSize; + blA += mr; + } + + ei_pstoreu(&res[(j2+0)*resStride + i], C0); + ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); + } + for(int i=peeled_mc; i<rows; i++) + { + const Scalar* blA = &blockA[i*strideA+offsetA]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + // gets a 1 x 1 res block as registers + Scalar C0(0); + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; + for(int k=0; k<depth; k++) + C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0); + res[(j2+0)*resStride + i] += C0; + } + } + } +}; + +// pack a block of the lhs +// The travesal is as follow (mr==4): +// 0 4 8 12 ... +// 1 5 9 13 ... +// 2 6 10 14 ... +// 3 7 11 15 ... +// +// 16 20 24 28 ... +// 17 21 25 29 ... +// 18 22 26 30 ... +// 19 23 27 31 ... +// +// 32 33 34 35 ... +// 36 36 38 39 ... +template<typename Scalar, int mr, int StorageOrder, bool Conjugate, bool PanelMode> +struct ei_gemm_pack_lhs +{ + void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, int lhsStride, int depth, int rows, + int stride=0, int offset=0) + { + ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride); + int count = 0; + const int peeled_mc = (rows/mr)*mr; + for(int i=0; i<peeled_mc; i+=mr) + { + if(PanelMode) count += mr * offset; + for(int k=0; k<depth; k++) + for(int w=0; w<mr; w++) + blockA[count++] = cj(lhs(i+w, k)); + if(PanelMode) count += mr * (stride-offset-depth); + } + for(int i=peeled_mc; i<rows; i++) + { + if(PanelMode) count += offset; + for(int k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); + if(PanelMode) count += (stride-offset-depth); + } + } +}; + +// copy a complete panel of the rhs while expending each coefficient into a packet form +// this version is optimized for column major matrices +// The traversal order is as follow (nr==4): +// 0 1 2 3 12 13 14 15 24 27 +// 4 5 6 7 16 17 18 19 25 28 +// 8 9 10 11 20 21 22 23 26 29 +// . . . . . . . . . . 
+template<typename Scalar, int nr, bool PanelMode> +struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode> +{ + enum { PacketSize = ei_packet_traits<Scalar>::size }; + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, + int stride=0, int offset=0) + { + ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + bool hasAlpha = alpha != Scalar(1); + int packet_cols = (cols/nr) * nr; + int count = 0; + for(int j2=0; j2<packet_cols; j2+=nr) + { + // skip what we have before + if(PanelMode) count += PacketSize * nr * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + if (hasAlpha) + { + for(int k=0; k<depth; k++) + { + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k])); + if (nr==4) + { + ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k])); + ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k])); + } + count += nr*PacketSize; + } + } + else + { + for(int k=0; k<depth; k++) + { + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); + if (nr==4) + { + ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k])); + ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k])); + } + count += nr*PacketSize; + } + } + // skip what we have after + if(PanelMode) count += PacketSize * nr * (stride-offset-depth); + } + // copy the remaining columns one at a time (nr==1) + for(int j2=packet_cols; j2<cols; ++j2) + { + if(PanelMode) count += PacketSize * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + if (hasAlpha) + { + for(int k=0; k<depth; k++) + { + ei_pstore(&blockB[count], ei_pset1(alpha*b0[k])); + count += PacketSize; + } + } + else + { + for(int k=0; k<depth; k++) + { + ei_pstore(&blockB[count], ei_pset1(b0[k])); + count += PacketSize; + } + } + if(PanelMode) count += PacketSize * (stride-offset-depth); + } + } +}; + +// this version is optimized for row major matrices +template<typename Scalar, int nr, bool PanelMode> +struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode> +{ + enum { PacketSize = ei_packet_traits<Scalar>::size }; + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, + int stride=0, int offset=0) + { + ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + bool hasAlpha = alpha != Scalar(1); + int packet_cols = (cols/nr) * nr; + int count = 0; + for(int j2=0; j2<packet_cols; j2+=nr) + { + // skip what we have before + if(PanelMode) count += PacketSize * nr * offset; + if (hasAlpha) + { + for(int k=0; k<depth; k++) + { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1])); + if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2])); + if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3])); + count += nr*PacketSize; + } + } + else + { + for(int k=0; k<depth; k++) + { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1])); + if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2])); + if(nr==4) ei_pstore(&blockB[count+3*PacketSize], 
ei_pset1(b0[3])); + count += nr*PacketSize; + } + } + // skip what we have after + if(PanelMode) count += PacketSize * nr * (stride-offset-depth); + } + // copy the remaining columns one at a time (nr==1) + for(int j2=packet_cols; j2<cols; ++j2) + { + if(PanelMode) count += PacketSize * offset; + const Scalar* b0 = &rhs[j2]; + for(int k=0; k<depth; k++) + { + ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride])); + count += PacketSize; + } + if(PanelMode) count += PacketSize * (stride-offset-depth); + } + } +}; + +#endif // EIGEN_EXTERN_INSTANTIATIONS + +#endif // EIGEN_GENERAL_BLOCK_PANEL_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 365499001..ff0f2c1b4 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -27,6 +27,7 @@ #ifndef EIGEN_EXTERN_INSTANTIATIONS +/* Specialization for a row-major destination matrix => simple transposition of the product */ template< typename Scalar, int LhsStorageOrder, bool ConjugateLhs, @@ -51,6 +52,8 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS } }; +/* Specialization for a col-major destination matrix + * => Blocking algorithm following Goto's paper */ template< typename Scalar, int LhsStorageOrder, bool ConjugateLhs, @@ -78,23 +81,30 @@ static void run(int rows, int cols, int depth, Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); - // => GEMM_VAR1 + // For each horizontal panel of the rhs, and corresponding panel of the lhs... + // (==GEMM_VAR1) for(int k2=0; k2<depth; k2+=kc) { const int actual_kc = std::min(k2+kc,depth)-k2; - // we have selected one row panel of rhs and one column panel of lhs - // pack rhs's panel into a sequential chunk of memory - // and expand each coeff to a constant packet for further reuse + // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. + // => Pack rhs's panel into a sequential chunk of memory (L2 caching) + // Note that this panel will be read as many times as the number of blocks in the lhs's + // vertical panel which is, in practice, a very low number. ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); - // => GEPP_VAR1 + // For each mc x kc block of the lhs's vertical panel... + // (==GEPP_VAR1) for(int i2=0; i2<rows; i2+=mc) { const int actual_mc = std::min(i2+mc,rows)-i2; + // We pack the lhs's block into a sequential chunk of memory (L1 caching) + // Note that this block will be read a very high number of times, which is equal to the number of + // micro vertical panel of the large rhs's panel (e.g., cols/4 times). 
ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + // Everything is packed, we can now call the block * panel kernel: ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } @@ -106,417 +116,49 @@ static void run(int rows, int cols, int depth, }; -// optimized GEneral packed Block * packed Panel product kernel -template<typename Scalar, int mr, int nr, typename Conj> -struct ei_gebp_kernel -{ - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0) - { - typedef typename ei_packet_traits<Scalar>::type PacketType; - enum { PacketSize = ei_packet_traits<Scalar>::size }; - if(strideA==-1) strideA = depth; - if(strideB==-1) strideB = depth; - Conj cj; - int packet_cols = (cols/nr) * nr; - const int peeled_mc = (rows/mr)*mr; - // loops on each cache friendly block of the result/rhs - for(int j2=0; j2<packet_cols; j2+=nr) - { - // loops on each register blocking of lhs/res - for(int i=0; i<peeled_mc; i+=mr) - { - const Scalar* blA = &blockA[i*strideA+offsetA*mr]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // TODO move the res loads to the stores - - // gets res block as register - PacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); - C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]); - if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]); - if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]); - - // performs "inner" product - // TODO let's check wether the flowing peeled loop could not be - // optimized via optimal prefetching from one loop to the other - const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; - const int peeled_kc = (depth/4)*4; - for(int k=0; k<peeled_kc; k+=4) - { - PacketType B0, B1, B2, B3, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = cj.pmadd(A1, B0, C4); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[2*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[3*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 
9 : 5)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[4*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[5*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); - - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[6*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[7*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - - blB += 4*nr*PacketSize; - blA += 4*mr; - } - // process remaining peeled loop - for(int k=peeled_kc; k<depth; k++) - { - PacketType B0, B1, B2, B3, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = cj.pmadd(A1, B0, C4); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - - blB += nr*PacketSize; - blA += mr; - } - - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+1)*resStride + i], C1); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); - ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); - ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7); - } - for(int i=peeled_mc; i<rows; i++) - { - const Scalar* blA = &blockA[i*strideA+offsetA]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // gets a 1 x nr res block as registers - Scalar C0(0), C1(0), C2(0), C3(0); - const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; - for(int k=0; k<depth; k++) - { - Scalar B0, B1, B2, B3, A0; - - A0 = blA[k]; - B0 = blB[0*PacketSize]; - B1 = blB[1*PacketSize]; - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = blB[2*PacketSize]; - if(nr==4) B3 = blB[3*PacketSize]; - C1 = cj.pmadd(A0, B1, C1); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - - blB += nr*PacketSize; - } - res[(j2+0)*resStride + i] += C0; - res[(j2+1)*resStride + i] += C1; - if(nr==4) res[(j2+2)*resStride + i] += C2; - if(nr==4) res[(j2+3)*resStride + i] += C3; - } - } - - // process remaining rhs/res columns one at a time - // => do the same but with nr==1 - for(int j2=packet_cols; j2<cols; j2++) - { - for(int i=0; i<peeled_mc; i+=mr) - { - const Scalar* blA = &blockA[i*strideA+offsetA*mr]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // TODO move the 
res loads to the stores - - // gets res block as register - PacketType C0, C4; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); - - const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; - for(int k=0; k<depth; k++) - { - PacketType B0, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); +#endif // EIGEN_EXTERN_INSTANTIATIONS - blB += PacketSize; - blA += mr; - } +/********************************************************************************* +* Specialization of GeneralProduct<> for "large" GEMM, i.e., +* implementation of the high level wrapper to ei_general_matrix_matrix_product +**********************************************************************************/ - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); - } - for(int i=peeled_mc; i<rows; i++) - { - const Scalar* blA = &blockA[i*strideA+offsetA]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif +template<typename Lhs, typename Rhs> +struct ei_traits<GeneralProduct<Lhs,Rhs,GemmProduct> > + : ei_traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> > +{}; - // gets a 1 x 1 res block as registers - Scalar C0(0); - const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; - for(int k=0; k<depth; k++) - C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0); - res[(j2+0)*resStride + i] += C0; - } - } - } -}; - -// pack a block of the lhs -// The travesal is as follow (mr==4): -// 0 4 8 12 ... -// 1 5 9 13 ... -// 2 6 10 14 ... -// 3 7 11 15 ... -// -// 16 20 24 28 ... -// 17 21 25 29 ... -// 18 22 26 30 ... -// 19 23 27 31 ... -// -// 32 33 34 35 ... -// 36 36 38 39 ... -template<typename Scalar, int mr, int StorageOrder, bool Conjugate, bool PanelMode> -struct ei_gemm_pack_lhs +template<typename Lhs, typename Rhs> +class GeneralProduct<Lhs, Rhs, GemmProduct> + : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> { - void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, int lhsStride, int depth, int rows, - int stride=0, int offset=0) - { - ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride); - int count = 0; - const int peeled_mc = (rows/mr)*mr; - for(int i=0; i<peeled_mc; i+=mr) - { - if(PanelMode) count += mr * offset; - for(int k=0; k<depth; k++) - for(int w=0; w<mr; w++) - blockA[count++] = cj(lhs(i+w, k)); - if(PanelMode) count += mr * (stride-offset-depth); - } - for(int i=peeled_mc; i<rows; i++) - { - if(PanelMode) count += offset; - for(int k=0; k<depth; k++) - blockA[count++] = cj(lhs(i, k)); - if(PanelMode) count += (stride-offset-depth); - } - } -}; + public: + EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) -// copy a complete panel of the rhs while expending each coefficient into a packet form -// this version is optimized for column major matrices -// The traversal order is as follow (nr==4): -// 0 1 2 3 12 13 14 15 24 27 -// 4 5 6 7 16 17 18 19 25 28 -// 8 9 10 11 20 21 22 23 26 29 -// . . . . . . . . . . 
-template<typename Scalar, int nr, bool PanelMode> -struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode> -{ - enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, - int stride=0, int offset=0) - { - ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - bool hasAlpha = alpha != Scalar(1); - int packet_cols = (cols/nr) * nr; - int count = 0; - for(int j2=0; j2<packet_cols; j2+=nr) - { - // skip what we have before - if(PanelMode) count += PacketSize * nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - if (hasAlpha) - { - for(int k=0; k<depth; k++) - { - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k])); - if (nr==4) - { - ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k])); - ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k])); - } - count += nr*PacketSize; - } - } - else - { - for(int k=0; k<depth; k++) - { - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); - if (nr==4) - { - ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k])); - ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k])); - } - count += nr*PacketSize; - } - } - // skip what we have after - if(PanelMode) count += PacketSize * nr * (stride-offset-depth); - } - // copy the remaining columns one at a time (nr==1) - for(int j2=packet_cols; j2<cols; ++j2) - { - if(PanelMode) count += PacketSize * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - if (hasAlpha) - { - for(int k=0; k<depth; k++) - { - ei_pstore(&blockB[count], ei_pset1(alpha*b0[k])); - count += PacketSize; - } - } - else - { - for(int k=0; k<depth; k++) - { - ei_pstore(&blockB[count], ei_pset1(b0[k])); - count += PacketSize; - } - } - if(PanelMode) count += PacketSize * (stride-offset-depth); - } - } -}; + GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} -// this version is optimized for row major matrices -template<typename Scalar, int nr, bool PanelMode> -struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode> -{ - enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, - int stride=0, int offset=0) - { - ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - bool hasAlpha = alpha != Scalar(1); - int packet_cols = (cols/nr) * nr; - int count = 0; - for(int j2=0; j2<packet_cols; j2+=nr) - { - // skip what we have before - if(PanelMode) count += PacketSize * nr * offset; - if (hasAlpha) - { - for(int k=0; k<depth; k++) - { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1])); - if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2])); - if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3])); - count += nr*PacketSize; - } - } - else - { - for(int k=0; k<depth; k++) - { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1])); - if(nr==4) ei_pstore(&blockB[count+2*PacketSize], 
ei_pset1(b0[2])); - if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3])); - count += nr*PacketSize; - } - } - // skip what we have after - if(PanelMode) count += PacketSize * nr * (stride-offset-depth); - } - // copy the remaining columns one at a time (nr==1) - for(int j2=packet_cols; j2<cols; ++j2) + template<typename Dest> void addTo(Dest& dst, Scalar alpha) const { - if(PanelMode) count += PacketSize * offset; - const Scalar* b0 = &rhs[j2]; - for(int k=0; k<depth; k++) - { - ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride])); - count += PacketSize; - } - if(PanelMode) count += PacketSize * (stride-offset-depth); + ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + ei_general_matrix_matrix_product< + Scalar, + (_ActualLhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsBlasTraits::NeedToConjugate), + (_ActualRhsType::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit)?RowMajor:ColMajor> + ::run( + this->rows(), this->cols(), lhs.cols(), + (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), + (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), + (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), + actualAlpha); } - } }; -#endif // EIGEN_EXTERN_INSTANTIATIONS - #endif // EIGEN_GENERAL_MATRIX_MATRIX_H diff --git a/Eigen/src/Core/products/GeneralUnrolled.h b/Eigen/src/Core/products/GeneralUnrolled.h new file mode 100644 index 000000000..7241976a8 --- /dev/null +++ b/Eigen/src/Core/products/GeneralUnrolled.h @@ -0,0 +1,385 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com> +// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_GENERAL_UNROLLED_PRODUCT_H +#define EIGEN_GENERAL_UNROLLED_PRODUCT_H + +/********************************************************************************* +* Specialization of GeneralProduct<> for products with small fixed sizes +*********************************************************************************/ + +/* Since the all the dimensions of the product are small, here we can rely + * on the generic Assign mechanism to evaluate the product per coeff (or packet). 
+ * + * Note that here the inner-loops should always be unrolled. + */ + +template<int VectorizationMode, int Index, typename Lhs, typename Rhs, typename RetScalar> +struct ei_product_coeff_impl; + +template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +struct ei_product_packet_impl; + +template<typename LhsNested, typename RhsNested> +struct ei_traits<GeneralProduct<LhsNested,RhsNested,UnrolledProduct> > +{ + typedef typename ei_cleantype<LhsNested>::type _LhsNested; + typedef typename ei_cleantype<RhsNested>::type _RhsNested; + typedef typename ei_scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar; + + enum { + LhsCoeffReadCost = _LhsNested::CoeffReadCost, + RhsCoeffReadCost = _RhsNested::CoeffReadCost, + LhsFlags = _LhsNested::Flags, + RhsFlags = _RhsNested::Flags, + + RowsAtCompileTime = _LhsNested::RowsAtCompileTime, + ColsAtCompileTime = _RhsNested::ColsAtCompileTime, + InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime), + + MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime, + MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime, + + LhsRowMajor = LhsFlags & RowMajorBit, + RhsRowMajor = RhsFlags & RowMajorBit, + + CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) + && (ColsAtCompileTime == Dynamic || (ColsAtCompileTime % ei_packet_traits<Scalar>::size) == 0), + + CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) + && (RowsAtCompileTime == Dynamic || (RowsAtCompileTime % ei_packet_traits<Scalar>::size) == 0), + + EvalToRowMajor = RhsRowMajor && (!CanVectorizeLhs), + + RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit), + + Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits) + | EvalBeforeAssigningBit + | EvalBeforeNestingBit + | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0) + | (LhsFlags & RhsFlags & AlignedBit), + + CoeffReadCost = InnerSize == Dynamic ? Dynamic + : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + + (InnerSize - 1) * NumTraits<Scalar>::AddCost, + + /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside + * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner + * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect + * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. + */ + CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit) + && (InnerSize % ei_packet_traits<Scalar>::size == 0) + }; +}; + +template<typename LhsNested, typename RhsNested> class GeneralProduct<LhsNested,RhsNested,UnrolledProduct> + : ei_no_assignment_operator, + public MatrixBase<GeneralProduct<LhsNested, RhsNested, UnrolledProduct> > +{ + public: + + EIGEN_GENERIC_PUBLIC_INTERFACE(GeneralProduct) + + private: + + typedef typename ei_traits<GeneralProduct>::_LhsNested _LhsNested; + typedef typename ei_traits<GeneralProduct>::_RhsNested _RhsNested; + + enum { + PacketSize = ei_packet_traits<Scalar>::size, + InnerSize = ei_traits<GeneralProduct>::InnerSize, + Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, + CanVectorizeInner = ei_traits<GeneralProduct>::CanVectorizeInner + }; + + typedef ei_product_coeff_impl<CanVectorizeInner ? InnerVectorization : NoVectorization, + Unroll ? 
InnerSize-1 : Dynamic, + _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl; + + public: + + template<typename Lhs, typename Rhs> + inline GeneralProduct(const Lhs& lhs, const Rhs& rhs) + : m_lhs(lhs), m_rhs(rhs) + { + // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable. + // We still allow to mix T and complex<T>. + EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret), + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + ei_assert(lhs.cols() == rhs.rows() + && "invalid matrix product" + && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); + } + + EIGEN_STRONG_INLINE int rows() const { return m_lhs.rows(); } + EIGEN_STRONG_INLINE int cols() const { return m_rhs.cols(); } + + EIGEN_STRONG_INLINE const Scalar coeff(int row, int col) const + { + Scalar res; + ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); + return res; + } + + /* Allow index-based non-packet access. It is impossible though to allow index-based packed access, + * which is why we don't set the LinearAccessBit. + */ + EIGEN_STRONG_INLINE const Scalar coeff(int index) const + { + Scalar res; + const int row = RowsAtCompileTime == 1 ? 0 : index; + const int col = RowsAtCompileTime == 1 ? index : 0; + ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); + return res; + } + + template<int LoadMode> + EIGEN_STRONG_INLINE const PacketScalar packet(int row, int col) const + { + PacketScalar res; + ei_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor, + Unroll ? InnerSize-1 : Dynamic, + _LhsNested, _RhsNested, PacketScalar, LoadMode> + ::run(row, col, m_lhs, m_rhs, res); + return res; + } + + protected: + const LhsNested m_lhs; + const RhsNested m_rhs; +}; + + +/*************************************************************************** +* Normal product .coeff() implementation (with meta-unrolling) +***************************************************************************/ + +/************************************** +*** Scalar path - no vectorization *** +**************************************/ + +template<int Index, typename Lhs, typename Rhs, typename RetScalar> +struct ei_product_coeff_impl<NoVectorization, Index, Lhs, Rhs, RetScalar> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + ei_product_coeff_impl<NoVectorization, Index-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res); + res += lhs.coeff(row, Index) * rhs.coeff(Index, col); + } +}; + +template<typename Lhs, typename Rhs, typename RetScalar> +struct ei_product_coeff_impl<NoVectorization, 0, Lhs, Rhs, RetScalar> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + res = lhs.coeff(row, 0) * rhs.coeff(0, col); + } +}; + +template<typename Lhs, typename Rhs, typename RetScalar> +struct ei_product_coeff_impl<NoVectorization, Dynamic, Lhs, Rhs, RetScalar> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar& res) + { + ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = lhs.coeff(row, 0) * rhs.coeff(0, col); + for(int i = 1; i < lhs.cols(); ++i) + res += lhs.coeff(row, i) * rhs.coeff(i, col); + } +}; + +// prevent buggy user code from causing an infinite recursion +template<typename Lhs, typename Rhs, typename RetScalar> +struct ei_product_coeff_impl<NoVectorization, -1, Lhs, Rhs, 
RetScalar> +{ + EIGEN_STRONG_INLINE static void run(int, int, const Lhs&, const Rhs&, RetScalar&) {} +}; + +/******************************************* +*** Scalar path with inner vectorization *** +*******************************************/ + +template<int Index, typename Lhs, typename Rhs, typename PacketScalar> +struct ei_product_coeff_vectorized_unroller +{ + enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) + { + ei_product_coeff_vectorized_unroller<Index-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); + pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, Index) , rhs.template packet<Aligned>(Index, col) )); + } +}; + +template<typename Lhs, typename Rhs, typename PacketScalar> +struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) + { + pres = ei_pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col)); + } +}; + +template<int Index, typename Lhs, typename Rhs, typename RetScalar> +struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs, RetScalar> +{ + typedef typename Lhs::PacketScalar PacketScalar; + enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + PacketScalar pres; + ei_product_coeff_vectorized_unroller<Index+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); + ei_product_coeff_impl<NoVectorization,Index,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res); + res = ei_predux(pres); + } +}; + +template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime> +struct ei_product_coeff_vectorized_dyn_selector +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>, + Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>, + LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs.col(col)); + } +}; + +// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower +// NOTE maybe they are now useless since we have a specialization for Block<Matrix> +template<typename Lhs, typename Rhs, int RhsCols> +struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols> +{ + EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Lhs, + Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>, + LinearVectorization, NoUnrolling>::run(lhs, rhs.col(col)); + } +}; + +template<typename Lhs, typename Rhs, int LhsRows> +struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1> +{ + EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>, + Rhs, + LinearVectorization, NoUnrolling>::run(lhs.row(row), rhs); + } +}; + +template<typename Lhs, typename Rhs> +struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1> +{ + EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = ei_dot_impl< + Lhs, + Rhs, + 
LinearVectorization, NoUnrolling>::run(lhs, rhs); + } +}; + +template<typename Lhs, typename Rhs, typename RetScalar> +struct ei_product_coeff_impl<InnerVectorization, Dynamic, Lhs, Rhs, RetScalar> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res); + } +}; + +/******************* +*** Packet path *** +*******************/ + +template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +struct ei_product_packet_impl<RowMajor, Index, Lhs, Rhs, PacketScalar, LoadMode> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + ei_product_packet_impl<RowMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); + res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res); + } +}; + +template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +struct ei_product_packet_impl<ColMajor, Index, Lhs, Rhs, PacketScalar, LoadMode> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + ei_product_packet_impl<ColMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); + res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res); + } +}; + +template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + } +}; + +template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); + } +}; + +template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + { + ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + for(int i = 1; i < lhs.cols(); ++i) + res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); + } +}; + +template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> +{ + EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + { + ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); + for(int i = 1; i < lhs.cols(); ++i) + res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res); + } +}; + +#endif // EIGEN_GENERAL_UNROLLED_PRODUCT_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index acd239d28..1e92ada27 100644 --- 
a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -339,4 +339,56 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs, } }; +/*************************************************************************** +* Wrapper to ei_product_selfadjoint_matrix +***************************************************************************/ + +template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> +struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> > + : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> > +{}; + +template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> +struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> + : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + enum { + LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit), + LhsIsSelfAdjoint = (LhsMode&SelfAdjointBit)==SelfAdjointBit, + RhsUpLo = RhsMode&(UpperTriangularBit|LowerTriangularBit), + RhsIsSelfAdjoint = (RhsMode&SelfAdjointBit)==SelfAdjointBit + }; + + template<typename Dest> void addTo(Dest& dst, Scalar alpha) const + { + ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + ei_product_selfadjoint_matrix<Scalar, + EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular, + ei_traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, + NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsUpLo==UpperTriangular,bool(LhsBlasTraits::NeedToConjugate)), + EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular, + ei_traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, + NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsUpLo==UpperTriangular,bool(RhsBlasTraits::NeedToConjugate)), + ei_traits<Dest>::Flags&RowMajorBit ? 
RowMajor : ColMajor> + ::run( + lhs.rows(), rhs.cols(), // sizes + &lhs.coeff(0,0), lhs.stride(), // lhs info + &rhs.coeff(0,0), rhs.stride(), // rhs info + &dst.coeffRef(0,0), dst.stride(), // result info + actualAlpha // alpha + ); + } +}; + #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 279836f8a..f0004cdb9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -154,5 +154,48 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( ei_aligned_stack_delete(Scalar, const_cast<Scalar*>(rhs), size); } +/*************************************************************************** +* Wrapper to ei_product_selfadjoint_vector +***************************************************************************/ + +template<typename Lhs, int LhsMode, typename Rhs> +struct ei_traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> > + : ei_traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> > +{}; + +template<typename Lhs, int LhsMode, typename Rhs> +struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> + : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + enum { + LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit) + }; + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template<typename Dest> void addTo(Dest& dst, Scalar alpha) const + { + ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet"); + ei_product_selfadjoint_vector<Scalar, ei_traits<_ActualLhsType>::Flags&RowMajorBit, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)> + ( + lhs.rows(), // size + &lhs.coeff(0,0), lhs.stride(), // lhs info + &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info + &dst.coeffRef(0), // result info + actualAlpha // scale factor + ); + } +}; + #endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 216f2dd69..f4690c0ca 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -139,7 +139,7 @@ struct ei_product_blocking_traits // register block size along the N direction (must be either 2 or 4) nr = HalfRegisterCount/2, - // register block size along the M direction (this cannot be modified) + // register block size along the M direction (currently, this one cannot be modified) mr = 2 * PacketSize, // max cache block size along the K direction |
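
Note on the meta-unrolling used in GeneralUnrolled.h above: ei_product_coeff_impl computes one destination coefficient as a fixed-length dot product by recursing on the Index template parameter (starting from InnerSize-1, as in the ScalarCoeffImpl typedef), with an Index==0 base case, a Dynamic specialization for runtime loops, and an Index==-1 specialization that stops buggy recursions. The following is a minimal standalone sketch of that recursion pattern only — it is not Eigen's code, and the names (unrolled_dot) are invented for illustration; assumptions: plain std::array operands and a compile-time length, no packet/vectorized path.

// Sketch: compile-time unrolled dot product via recursive template
// specialization, mirroring the ei_product_coeff_impl<NoVectorization,...>
// recursion shown in the diff above.
#include <array>
#include <cstddef>
#include <cstdio>

template<int Index>
struct unrolled_dot
{
  template<typename Scalar, std::size_t N>
  static Scalar run(const std::array<Scalar,N>& a, const std::array<Scalar,N>& b)
  {
    // accumulate terms 0..Index-1 first, then add the Index-th product
    return unrolled_dot<Index-1>::run(a, b) + a[Index] * b[Index];
  }
};

template<>
struct unrolled_dot<0>
{
  template<typename Scalar, std::size_t N>
  static Scalar run(const std::array<Scalar,N>& a, const std::array<Scalar,N>& b)
  {
    // base case, analogous to the Index==0 specialization in the diff
    return a[0] * b[0];
  }
};

int main()
{
  std::array<float,4> a = {1.f, 2.f, 3.f, 4.f};
  std::array<float,4> b = {5.f, 6.f, 7.f, 8.f};
  // Index is length-1, just as ScalarCoeffImpl is instantiated with InnerSize-1
  std::printf("%f\n", unrolled_dot<3>::run(a, b)); // prints 70
  return 0;
}

Because every instantiation is a fixed Index, the whole sum compiles to straight-line multiply-adds; Eigen only takes this path when the cost estimate (CoeffReadCost <= EIGEN_UNROLLING_LIMIT in the traits above) says unrolling is worthwhile, otherwise the Dynamic specialization falls back to an ordinary runtime loop.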