Diffstat (limited to 'Eigen')
61 files changed, 2636 insertions, 1191 deletions
diff --git a/Eigen/Core b/Eigen/Core index dcb20bfd0..216428e97 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -125,9 +125,7 @@ #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif + // include files // This extern "C" works around a MINGW-w64 compilation issue @@ -187,6 +185,11 @@ #endif #endif +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include <vector_types.h> +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -302,9 +305,15 @@ using std::ptrdiff_t; #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" #endif +#if defined EIGEN_VECTORIZE_CUDA + #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/MathFunctions.h" +#endif + #include "src/Core/arch/Default/Settings.h" #include "src/Core/functors/BinaryFunctors.h" diff --git a/Eigen/SparseCore b/Eigen/SparseCore index b68c8fa8a..48ed967b8 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -26,21 +26,17 @@ * This module depends on: Core. */ -namespace Eigen { - -/** The type used to identify a general sparse storage. */ -struct Sparse {}; - -} - #include "src/SparseCore/SparseUtil.h" #include "src/SparseCore/SparseMatrixBase.h" #include "src/SparseCore/SparseAssign.h" #include "src/SparseCore/CompressedStorage.h" #include "src/SparseCore/AmbiVector.h" +#include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" +#include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" +#include "src/SparseCore/SparseRef.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" #include "src/SparseCore/SparseCwiseBinaryOp.h" #include "src/SparseCore/SparseTranspose.h" diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 6ea383695..2ef37ca1c 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -87,7 +87,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, - Flags = (traits<XprType>::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit + Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit // FIXME DirectAccessBit should not be handled by expressions }; }; diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index 98ebe3bf6..808f3977c 100644 --- a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h @@ -28,7 +28,6 @@ template<typename XprType> struct CommaInitializer { typedef typename XprType::Scalar Scalar; - typedef typename XprType::StorageIndex StorageIndex; EIGEN_DEVICE_FUNC inline CommaInitializer(XprType& xpr, const Scalar& s) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index eb35b44cb..9485080d3 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -111,6 +111,7 @@ struct evaluator_base typedef evaluator<ExpressionType> type; typedef evaluator<ExpressionType> nestedType; + // FIXME is it really useful? 
typedef typename traits<ExpressionType>::StorageIndex StorageIndex; // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices. typedef traits<ExpressionType> ExpressionTraits; @@ -128,7 +129,6 @@ struct evaluator<PlainObjectBase<Derived> > : evaluator_base<Derived> { typedef PlainObjectBase<Derived> PlainObjectType; - typedef typename PlainObjectType::StorageIndex StorageIndex; typedef typename PlainObjectType::Scalar Scalar; typedef typename PlainObjectType::CoeffReturnType CoeffReturnType; typedef typename PlainObjectType::PacketScalar PacketScalar; @@ -264,7 +264,6 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased> EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -343,7 +342,6 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> > : m_functor(n.functor()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -394,7 +392,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased > m_argImpl(op.nestedExpression()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -469,7 +466,6 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase m_rhsImpl(xpr.rhs()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -522,7 +518,6 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased> m_argImpl(op.nestedExpression()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -563,7 +558,6 @@ struct mapbase_evaluator : evaluator_base<Derived> { typedef Derived XprType; typedef typename XprType::PointerType PointerType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -760,7 +754,6 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa m_startCol(block.startCol()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -865,7 +858,6 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > m_elseImpl(select.elseMatrix()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const @@ -898,7 +890,6 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> > : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> > { typedef Replicate<ArgType, RowFactor, ColFactor> XprType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType 
PacketReturnType; enum { @@ -981,7 +972,6 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> > : m_expr(expr) {} - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::CoeffReturnType CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const @@ -1016,7 +1006,6 @@ struct evaluator_wrapper_base EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} - typedef typename ArgType::StorageIndex StorageIndex; typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; typedef typename ArgType::PacketScalar PacketScalar; @@ -1103,7 +1092,6 @@ struct unary_evaluator<Reverse<ArgType, Direction> > : evaluator_base<Reverse<ArgType, Direction> > { typedef Reverse<ArgType, Direction> XprType; - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; @@ -1219,9 +1207,10 @@ struct evaluator<Diagonal<ArgType, DiagIndex> > m_index(diagonal.index()) { } - typedef typename XprType::StorageIndex StorageIndex; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; + // FIXME having to check whether ArgType is sparse here is not very nice. + typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value, + typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const { diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index f37091000..56beaf3bc 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -326,6 +326,12 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar> dst.setZero(); dst.diagonal() = src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/) + { dst.diagonal() += src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/) + { dst.diagonal() -= src.diagonal(); } }; } // namespace internal diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 4eb01b1b3..81750722c 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -11,7 +11,7 @@ #ifndef EIGEN_GENERAL_PRODUCT_H #define EIGEN_GENERAL_PRODUCT_H -namespace Eigen { +namespace Eigen { enum { Large = 2, @@ -252,12 +252,12 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true> bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), evalToDest ? 
dest.data() : static_dest.data()); - + if(!evalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN @@ -273,11 +273,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true> MappedDest(actualDestPtr, dest.size()) = dest; } + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; general_matrix_vector_product - <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhs.data(), actualRhs.innerStride(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), actualDestPtr, 1, compatibleAlpha); @@ -333,11 +335,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true> Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; } + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; general_matrix_vector_product - <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + <Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhsPtr, 1, + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhsPtr, 1), dest.data(), dest.innerStride(), actualAlpha); } @@ -410,7 +414,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const #ifdef EIGEN_DEBUG_PRODUCT internal::product_type<Derived,OtherDerived>::debug(); #endif - + return Product<Derived, OtherDerived>(derived(), other.derived()); } diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 81f4eef40..8759cd06c 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -54,6 +54,7 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, @@ -94,6 +95,8 @@ template<typename T> struct packet_traits : default_packet_traits }; }; +template<typename T> struct packet_traits<const T> : packet_traits<T> { }; + /** \internal \returns a + b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, @@ -356,7 +359,7 @@ pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */ template<typename Packet, int LoadMode> -inline Packet ploadt(const typename unpacket_traits<Packet>::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from) { if(LoadMode == Aligned) return pload<Packet>(from); @@ -367,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from) /** \internal copy the packet \a from to \a *to. 
* If StoreMode equals #Aligned, \a to must be 16 bytes aligned */ template<typename Scalar, typename Packet, int LoadMode> -inline void pstoret(Scalar* to, const Packet& from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) { if(LoadMode == Aligned) pstore(to, from); @@ -375,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from) pstoreu(to, from); } +/** \internal \returns a packet version of \a *from. + * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the + * hardware if available to speedup the loading of data that won't be modified + * by the current computation. + */ +template<typename Packet, int LoadMode> +inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from) +{ + return ploadt<Packet, LoadMode>(from); +} + /** \internal default implementation of palign() allowing partial specialization */ template<int Offset,typename PacketType> struct palign_impl @@ -433,6 +447,19 @@ ptranspose(PacketBlock<Packet,1>& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } +/*************************************************************************** + * Selector, i.e. vector of N boolean values used to select (i.e. blend) + * words from 2 packets. +***************************************************************************/ +template <size_t N> struct Selector { + bool select[N]; +}; + +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + return ifPacket.select[0] ? thenPacket : elsePacket; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 3dafee9d7..8dca9796d 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -171,6 +171,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors> template<typename Derived> class MapBase<Derived, WriteAccessors> : public MapBase<Derived, ReadOnlyAccessors> { + typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase; public: typedef MapBase<Derived, ReadOnlyAccessors> Base; @@ -238,11 +239,13 @@ template<typename Derived> class MapBase<Derived, WriteAccessors> EIGEN_DEVICE_FUNC Derived& operator=(const MapBase& other) { - Base::Base::operator=(other); + ReadOnlyMapBase::Base::operator=(other); return derived(); } - using Base::Base::operator=; + // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, + // see bugs 821 and 920. + using ReadOnlyMapBase::Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4c5fc1cae..16ad2dc7e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -14,7 +14,7 @@ namespace Eigen { // On WINCE, std::abs is defined for int only, so let's defined our own overloads: // This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too. 
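The generic pblend above only consults select[0], because the default "packet" is a single scalar; the SSE/AVX specializations introduced later in this diff blend per lane instead. A minimal standalone sketch of the per-lane contract, using a hypothetical 4-lane struct rather than Eigen's internal packet types:

#include <cstddef>
#include <iostream>

struct Packet4 { float v[4]; };                      // stand-in for a 4-wide SIMD packet

template <std::size_t N> struct Selector { bool select[N]; };

// Reference semantics of pblend: lane i comes from thenPacket when
// ifPacket.select[i] is true, from elsePacket otherwise.
Packet4 pblend4(const Selector<4>& ifPacket, const Packet4& thenPacket, const Packet4& elsePacket) {
  Packet4 r;
  for (std::size_t i = 0; i < 4; ++i)
    r.v[i] = ifPacket.select[i] ? thenPacket.v[i] : elsePacket.v[i];
  return r;
}

int main() {
  Selector<4> s = {{true, false, true, false}};
  Packet4 a = {{1, 2, 3, 4}}, b = {{10, 20, 30, 40}};
  Packet4 r = pblend4(s, a, b);
  for (float x : r.v) std::cout << x << ' ';         // prints: 1 20 3 40
}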
-#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500 +#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500 long abs(long x) { return (labs(x)); } double abs(double x) { return (fabs(x)); } float abs(float x) { return (fabsf(x)); } @@ -360,50 +360,31 @@ inline NewType cast(const OldType& x) } /**************************************************************************** -* Implementation of atanh2 * +* Implementation of log1p * ****************************************************************************/ template<typename Scalar> -struct atanh2_impl +struct log1p_impl { - static inline Scalar run(const Scalar& x, const Scalar& r) + static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if (__cplusplus >= 201103L) && !defined(__CYGWIN__) + // Let's be conservative and enable the default C++11 implementation only if we are sure it exists + #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC) using std::log1p; - return log1p(2 * x / (r - x)) / 2; + return log1p(x); #else - using std::abs; + typedef typename NumTraits<Scalar>::Real RealScalar; using std::log; - using std::sqrt; - Scalar z = x / r; - if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon())) - return log((r + x) / (r - x)) / 2; - else - return z + z*z*z / 3; + Scalar x1p = RealScalar(1) + x; + return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); #endif } }; -template<typename RealScalar> -struct atanh2_impl<std::complex<RealScalar> > -{ - typedef std::complex<RealScalar> Scalar; - static inline Scalar run(const Scalar& x, const Scalar& r) - { - using std::log; - using std::norm; - using std::sqrt; - Scalar z = x / r; - if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon()) - return RealScalar(0.5) * log((r + x) / (r - x)); - else - return z + z*z*z / RealScalar(3); - } -}; - template<typename Scalar> -struct atanh2_retval +struct log1p_retval { typedef Scalar type; }; @@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& template<typename Scalar> EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) +inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); + return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } template<typename Scalar> diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index b2c9b56ed..7f9d135f7 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -210,23 +210,26 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct> template<typename Dst, typename Lhs, typename Rhs, typename Func> EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { + typename evaluator<Rhs>::type rhsEval(rhs); // FIXME make sure lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and rhs + // FIXME we should probably build an evaluator for dst const Index cols = dst.cols(); for (Index j=0; j<cols; ++j) - func(dst.col(j), rhs.coeff(0,j) * lhs); + func(dst.col(j), rhsEval.coeff(0,j) * lhs); } // Row major result template<typename Dst, typename Lhs, typename Rhs, 
typename Func> -EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { +EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +{ + typename evaluator<Lhs>::type lhsEval(lhs); // FIXME make sure rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and lhs + // FIXME we should probably build an evaluator for dst const Index rows = dst.rows(); for (Index i=0; i<rows; ++i) - func(dst.row(i), lhs.coeff(i,0) * rhs); + func(dst.row(i), lhsEval.coeff(i,0) * rhs); } template<typename Lhs, typename Rhs> diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 0f17e3a89..f97048bda 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic> typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor, (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor> diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index e1316a73d..5d60ba149 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -213,18 +213,39 @@ MatrixBase<Derived>::adjoint() const namespace internal { template<typename MatrixType, - bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic> + bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic, + bool MatchPacketSize = + (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size)) + && (internal::evaluator<MatrixType>::Flags&PacketAccessBit) > struct inplace_transpose_selector; template<typename MatrixType> -struct inplace_transpose_selector<MatrixType,true> { // square matrix +struct inplace_transpose_selector<MatrixType,true,false> { // square matrix static void run(MatrixType& m) { m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose()); } }; +// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. template<typename MatrixType> -struct inplace_transpose_selector<MatrixType,false> { // non square matrix +struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize + static void run(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet; + typedef typename MatrixType::Index Index; + const Index PacketSize = internal::packet_traits<Scalar>::size; + const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? 
Aligned : Unaligned; + PacketBlock<Packet> A; + for (Index i=0; i<PacketSize; ++i) + A.packet[i] = m.template packetByOuterInner<Alignment>(i,0); + internal::ptranspose(A); + for (Index i=0; i<PacketSize; ++i) + m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]); + } +}; + +template<typename MatrixType,bool MatchPacketSize> +struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix static void run(MatrixType& m) { if (m.rows()==m.cols()) m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose()); diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e2376bd1f..be66a502a 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif @@ -58,7 +58,8 @@ template<> struct packet_traits<float> : default_packet_traits HasCos = 0, HasLog = 0, HasExp = 0, - HasSqrt = 0 + HasSqrt = 0, + HasBlend = 1 }; }; template<> struct packet_traits<double> : default_packet_traits @@ -72,7 +73,8 @@ template<> struct packet_traits<double> : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0 + HasExp = 0, + HasBlend = 1 }; }; @@ -133,7 +135,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co return pset1<Packet8i>(0); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, @@ -557,6 +559,19 @@ ptranspose(PacketBlock<Packet4d,4>& kernel) { kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index fa02f57a1..6b68fc7a5 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,12 +18,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_FUSE_CJMADD 
-#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h new file mode 100644 index 000000000..e7305c01e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -0,0 +1,75 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> EIGEN_STRONG_INLINE +float4 plog<float4>(const float4& a) +{ + return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 plog<double2>(const double2& a) +{ + return make_double2(log(a.x), log(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 pexp<float4>(const float4& a) +{ + return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 pexp<double2>(const double2& a) +{ + return make_double2(exp(a.x), exp(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 psqrt<float4>(const float4& a) +{ + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 psqrt<double2>(const double2& a) +{ + return make_double2(sqrt(a.x), sqrt(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 prsqrt<float4>(const float4& a) +{ + return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 prsqrt<double2>(const double2& a) +{ + return make_double2(rsqrt(a.x), rsqrt(a.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 000000000..19749c832 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,296 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
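The CUDA math functions above have no SIMD transcendentals to call: a float4 is just four scalars owned by one GPU thread, so plog/pexp/psqrt/prsqrt apply the scalar device function to each lane. A compilable sketch of the same lane-wise pattern outside Eigen (kernel name, launch shape, and host code are ours, not Eigen API):

#include <cuda_runtime.h>
#include <cstdio>

// Lane-wise exp on float4, mirroring the recipe of Eigen's pexp<float4> above.
__global__ void exp4(const float4* in, float4* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    float4 a = in[i];
    out[i] = make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
  }
}

int main() {
  float4 *in, *out;
  cudaMallocManaged(&in, sizeof(float4));
  cudaMallocManaged(&out, sizeof(float4));
  *in = make_float4(0.f, 1.f, 2.f, 3.f);
  exp4<<<1, 1>>>(in, out, 1);
  cudaDeviceSynchronize();
  printf("%f %f %f %f\n", out->x, out->y, out->z, out->w);  // 1, e, e^2, e^3
  cudaFree(in); cudaFree(out);
  return 0;
}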
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic<float4> { enum { value = true }; }; +template<> struct is_arithmetic<double2> { enum { value = true }; }; + + +template<> struct packet_traits<float> : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits<float4> { typedef float type; enum {size=4}; typedef float4 half; }; +template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) { + return *reinterpret_cast<const float4*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) { + return *reinterpret_cast<const double2*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) { + *reinterpret_cast<float4*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) { + *reinterpret_cast<double2*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +#ifdef __CUDA_ARCH__ +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { + return __ldg((const float4*)from); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { + return __ldg((const double2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) { + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) { + return make_double2(__ldg(from+0), __ldg(from+1)); +} +#endif + +template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline 
double pfirst<double2>(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) { + return fmin(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) { + return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) { + return make_double2(abs(a.x), abs(a.y)); +} + + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<float4,4>& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + + tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<double2,2>& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h new file mode 100644 index 000000000..6bb05bb92 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -0,0 +1,91 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
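On device code the ploadt_ro specializations above go through __ldg, which routes the load into the GPU's read-only data cache; the generic ploadt_ro added earlier in this diff simply falls back to ploadt. A standalone sketch of that dispatch; the compute-capability guard is our assumption (__ldg needs sm_35+, while the diff itself only tests __CUDA_ARCH__):

#include <cuda_runtime.h>

// Hypothetical helper contrasting the read-only load path with a plain aligned load.
__device__ float4 load_ro(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(reinterpret_cast<const float4*>(from));   // read-only (texture) cache path
#else
  return *reinterpret_cast<const float4*>(from);         // ordinary aligned load
#endif
}

__global__ void first_lane(const float* in, float* out) { *out = load_ro(in).x; }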
+ +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_NEON_H +#define EIGEN_MATH_FUNCTIONS_NEON_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp<Packet4f>(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; + + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + + x = vminq_f32(x, p4f_exp_hi); + x = vmaxq_f32(x, p4f_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, subtract 1 */ + Packet4ui mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, p4f_cephes_exp_C1); + Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, p4f_cephes_exp_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, p4f_1); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, p4i_0x7f); + mm = vshlq_n_s32(mm, 23); + Packet4f pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6c5c669a1..559682cf7 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -20,12 +20,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator @@ -88,7 +88,7 @@ template<> struct packet_traits<float> : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, + HasExp = 1, HasSqrt = 0 }; }; @@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co 
return pset1<Packet4i>(0); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. +// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. +// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. +// MLA is not fused i.e. does 2 roundings. +// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): +// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +#endif + +// No FMA instruction for int, so use MLA unconditionally. template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } @@ -492,6 +503,21 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) { //---------- double ---------- #if EIGEN_ARCH_ARM64 +#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) +// Bug 907: workaround missing declarations of the following two functions in the ADK +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_f64 (float64x2_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_f64_u64 (uint64x2_t __a) +{ + return (float64x2_t) __a; +} +#endif + typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; @@ -536,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +#endif template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } @@ -597,7 +627,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } -template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); } +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) { @@ -613,7 +648,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) } // Other reduction functions: // mul -template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); } +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif // min template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 414c4cb6a..565e448fe 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -44,7 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; #endif @@ -472,6 +473,11 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 28427c308..898cb9ab0 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif @@ -108,7 +108,8 @@ template<> struct packet_traits<float> : default_packet_traits HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; template<> struct packet_traits<double> : default_packet_traits @@ -123,7 +124,8 @@ template<> struct 
packet_traits<double> : default_packet_traits HasDiv = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; #endif @@ -135,7 +137,9 @@ template<> struct packet_traits<int> : default_packet_traits // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + + HasBlend = 1 }; }; @@ -227,7 +231,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif @@ -809,6 +813,37 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = _mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 7b2ed6728..11e5f591d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff } /** \internal */ -inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { - static std::ptrdiff_t m_l1CacheSize = 0; - static std::ptrdiff_t m_l2CacheSize = 0; - if(m_l2CacheSize==0) + static bool m_cache_sizes_initialized = false; + static std::ptrdiff_t m_l1CacheSize = 32*1024; + static std::ptrdiff_t 
m_l2CacheSize = 256*1024; + static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + + if(!m_cache_sizes_initialized) { - m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); - m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_cache_sizes_initialized = true; } - + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); m_l1CacheSize = *l1; m_l2CacheSize = *l2; + m_l3CacheSize = *l3; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); *l1 = m_l1CacheSize; *l2 = m_l2CacheSize; + *l3 = m_l3CacheSize; } else { @@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ +#define CEIL(a, b) ((a)+(b)-1)/(b) + template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType> -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - EIGEN_UNUSED_VARIABLE(n); // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed @@ -81,47 +91,75 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) // at the register level. For vectorization purpose, these small vertical panels are unpacked, // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. - std::ptrdiff_t l1, l2; - - typedef gebp_traits<LhsScalar,RhsScalar> Traits; - enum { - kdiv = KcFactor * 2 * Traits::nr - * Traits::RhsProgress * sizeof(RhsScalar), - mr = gebp_traits<LhsScalar,RhsScalar>::mr, - mr_mask = (0xffffffff/mr)*mr - }; + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + + if (num_threads > 1) { + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + k_mask = (0xffffffff/8)*8, + + mr = Traits::mr, + mr_mask = (0xffffffff/mr)*mr, + + nr = Traits::nr, + nr_mask = (0xffffffff/nr)*nr + }; + SizeType k_cache = (l1-ksub)/kdiv; + if (k_cache < k) { + k = k_cache & k_mask; + eigen_assert(k > 0); + } - manage_caching_sizes(GetAction, &l1, &l2); + SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + SizeType n_per_thread = CEIL(n, num_threads); + if (n_cache <= n_per_thread) { + // Don't exceed the capacity of the l2 cache. + eigen_assert(n_cache >= static_cast<SizeType>(nr)); + n = n_cache & nr_mask; + eigen_assert(n > 0); + } else { + n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask); + } -// k = std::min<SizeType>(k, l1/kdiv); -// SizeType _m = k>0 ? 
l2/(4 * sizeof(LhsScalar) * k) : 0; -// if(_m<m) m = _m & mr_mask; - - // In unit tests we do not want to use extra large matrices, - // so we reduce the block size to check the blocking strategy is not flawed + if (l3 > l2) { + // l3 is shared between all cores, so we'll give each thread its own chunk of l3. + SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + SizeType m_per_thread = CEIL(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) { + m = m_cache & mr_mask; + eigen_assert(m > 0); + } else { + m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask); + } + } + } + else { + // In unit tests we do not want to use extra large matrices, + // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS -// k = std::min<SizeType>(k,240); -// n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); -// m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); - - k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); - m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); + k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240); + n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); + m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); #else - k = std::min<SizeType>(k,24); - n = std::min<SizeType>(n,384/sizeof(RhsScalar)); - m = std::min<SizeType>(m,384/sizeof(RhsScalar)); + k = std::min<SizeType>(k,24); + n = std::min<SizeType>(n,384/sizeof(RhsScalar)); + m = std::min<SizeType>(m,384/sizeof(RhsScalar)); #endif + } } template<typename LhsScalar, typename RhsScalar, typename SizeType> -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); + computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads); } -#ifdef EIGEN_HAS_FUSE_CJMADD - #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD + #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else // FIXME (a bit overkill maybe ?) @@ -146,8 +184,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t); } - #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); + #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); +// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif /* Vectorization logic @@ -182,7 +220,7 @@ public: nr = 4, // register block size along the M direction (currently, this one cannot be modified) -#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -248,7 +286,7 @@ public: // let gcc allocate the register in which to store the result of the pmul // (in the case where there is no FMA) gcc fails to figure out how to avoid // spilling register. 
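To make the new cache-aware capping concrete, here is its core arithmetic with assumed numbers: float scalars, mr = 8, nr = 4, KcFactor = 1, and 32KB/256KB L1/L2 caches (real values come from gebp_traits and queryCacheSizes; the per-thread splitting of the L2/L3 budgets is ignored here):

#include <cstdio>

int main() {
  const long l1 = 32 * 1024, l2 = 256 * 1024;  // assumed cache sizes in bytes
  const long mr = 8, nr = 4, sz = 4;           // assumed register blocks, sizeof(float)
  const long kdiv = mr * sz + nr * sz;         // L1 bytes consumed per unit of depth k
  const long ksub = mr * nr * sz;              // the mr x nr accumulator tile
  long k = (l1 - ksub) / kdiv;                 // panel depth that keeps the kernel in L1
  k &= ~7L;                                    // k_mask: round down to a multiple of 8
  long n = (l2 - l1) / (nr * sz * k);          // columns of B' fitting in L2 next to A'
  n &= ~(nr - 1);                              // nr_mask: round down to a multiple of nr
  printf("k = %ld, n = %ld\n", k, n);          // k = 680, n = 20 with these numbers
}

With these inputs the panel depth k lands at 680 and the rhs panel width n at 20; the num_threads > 1 branch above additionally divides the n and m budgets by the thread count via CEIL before applying the same masks.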
-#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c = pmadd(a,b,c); #else @@ -290,7 +328,7 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, -#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -353,7 +391,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a.v,b,c.v); #else @@ -637,7 +675,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a,b.v,c.v); #else @@ -667,7 +705,7 @@ protected: * |real |cplx | no vectorization yet, would require to pack A with duplication * |cplx |real | easy vectorization */ -template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct gebp_kernel { typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; @@ -676,14 +714,15 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; - + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; - + + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, @@ -693,14 +732,16 @@ struct gebp_kernel }; EIGEN_DONT_INLINE - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; -template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> EIGEN_DONT_INLINE -void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> - ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, +void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs> + ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; @@ -743,15 +784,15 @@ void 
gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0); - internal::prefetch(r1); - internal::prefetch(r2); - internal::prefetch(r3); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(0); + r1.prefetch(0); + r2.prefetch(0); + r3.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -760,31 +801,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index k=0; k<peeled_kc; k+=pk) { - EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 4"); + EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); RhsPacket B_0, T0; LhsPacket A2; #define EIGEN_GEBGP_ONESTEP(K) \ - internal::prefetch(blA+(3*K+16)*LhsProgress); \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, T0); \ - traits.madd(A2, B_0, C8, B_0); \ - traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \ - traits.madd(A0, B_0, C1, T0); \ - traits.madd(A1, B_0, C5, T0); \ - traits.madd(A2, B_0, C9, B_0); \ - traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \ - traits.madd(A0, B_0, C2, T0); \ - traits.madd(A1, B_0, C6, T0); \ - traits.madd(A2, B_0, C10, B_0); \ - traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \ - traits.madd(A0, B_0, C3 , T0); \ - traits.madd(A1, B_0, C7, T0); \ - traits.madd(A2, B_0, C11, B_0) + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + internal::prefetch(blA+(3*K+16)*LhsProgress); \ + traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ + traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ + traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, T0); \ + traits.madd(A1, B_0, C4, T0); \ + traits.madd(A2, B_0, C8, B_0); \ + traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \ + traits.madd(A0, B_0, C1, T0); \ + traits.madd(A1, B_0, C5, T0); \ + traits.madd(A2, B_0, C9, B_0); \ + traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \ + traits.madd(A0, B_0, C2, T0); \ + traits.madd(A1, B_0, C6, T0); \ + traits.madd(A2, B_0, C10, B_0); \ + traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \ + traits.madd(A0, B_0, C3 , T0); \ + traits.madd(A1, B_0, C7, T0); \ + traits.madd(A2, B_0, C11, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while(false) internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); @@ -799,6 +845,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> blB += pk*4*RhsProgress; blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) @@ -813,48 +861,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> ResPacket R0, R1, R2; ResPacket alphav = 
pset1<ResPacket>(alpha); - - R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize); - R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); - - R0 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize); - R2 = ploadu<ResPacket>(r1+2*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r1.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); - pstoreu(r1+0*Traits::ResPacketSize, R0); - pstoreu(r1+1*Traits::ResPacketSize, R1); - pstoreu(r1+2*Traits::ResPacketSize, R2); - - R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize); - R2 = ploadu<ResPacket>(r2+2*Traits::ResPacketSize); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r2.loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r2+2*Traits::ResPacketSize, R2); - - R0 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize); - R2 = ploadu<ResPacket>(r3+2*Traits::ResPacketSize); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r3.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); - pstoreu(r3+0*Traits::ResPacketSize, R0); - pstoreu(r3+1*Traits::ResPacketSize, R1); - pstoreu(r3+2*Traits::ResPacketSize, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(2 * Traits::ResPacketSize, R2); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2<cols; j2++) { @@ -868,7 +916,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> traits.initAcc(C4); traits.initAcc(C8); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -876,16 +925,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index k=0; k<peeled_kc; k+=pk) { - EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 1"); + EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); RhsPacket B_0; #define EIGEN_GEBGP_ONESTEP(K) \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - 
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A2, B_0, C8, B_0) + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ + traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A2, B_0, C8, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while(false) EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -898,6 +952,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> blB += pk*RhsProgress; blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1"); } // process remaining peeled loop @@ -912,19 +968,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize); - R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - traits.acc(C8 , alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); } } } - + //---------- Process 2 * LhsProgress rows at once ---------- if(mr>=2*Traits::LhsProgress) { @@ -946,15 +1002,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -963,21 +1019,26 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index k=0; k<peeled_kc; k+=pk) { - EIGEN_ASM_COMMENT("begin gegp micro kernel 2pX4"); + EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); RhsPacket B_0, B1, B2, B3, T0; #define EIGEN_GEBGP_ONESTEP(K) \ - traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, 
C0, T0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A0, B1, C1, T0); \ - traits.madd(A1, B1, C5, B1); \ - traits.madd(A0, B2, C2, T0); \ - traits.madd(A1, B2, C6, B2); \ - traits.madd(A0, B3, C3, T0); \ - traits.madd(A1, B3, C7, B3) + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ + traits.madd(A0, B_0, C0, T0); \ + traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A0, B1, C1, T0); \ + traits.madd(A1, B1, C5, B1); \ + traits.madd(A0, B2, C2, T0); \ + traits.madd(A1, B2, C6, B2); \ + traits.madd(A0, B3, C3, T0); \ + traits.madd(A1, B3, C7, B3); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while(false) internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); @@ -992,6 +1053,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> blB += pk*4*RhsProgress; blA += pk*(2*Traits::LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) @@ -1002,37 +1065,37 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> blA += 2*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1, R2, R3; ResPacket alphav = pset1<ResPacket>(alpha); - - R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize); - R2 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize); - R3 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(0 * Traits::ResPacketSize); + R3 = r1.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); traits.acc(C5, alphav, R3); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r1+0*Traits::ResPacketSize, R2); - pstoreu(r1+1*Traits::ResPacketSize, R3); - - R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize); - R2 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize); - R3 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(0 * Traits::ResPacketSize, R2); + r1.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(0 * Traits::ResPacketSize); + R3 = r3.loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); traits.acc(C7, alphav, R3); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r3+0*Traits::ResPacketSize, R2); - pstoreu(r3+1*Traits::ResPacketSize, R3); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2<cols; j2++) { @@ -1045,8 +1108,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> traits.initAcc(C0); traits.initAcc(C4); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - 
internal::prefetch(r0+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1054,15 +1117,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index k=0; k<peeled_kc; k+=pk) { - EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1"); + EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1"); RhsPacket B_0, B1; #define EIGEN_GEBGP_ONESTEP(K) \ - traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B1); \ - traits.madd(A1, B_0, C4, B_0) + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B1); \ + traits.madd(A1, B_0, C4, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ + } while(false) EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1075,6 +1143,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> blB += pk*RhsProgress; blA += pk*2*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } // process remaining peeled loop @@ -1089,12 +1159,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> ResPacket R0, R1; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); } } } @@ -1120,15 +1190,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> traits.initAcc(C2); traits.initAcc(C3); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -1137,16 +1207,21 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index k=0; k<peeled_kc; k+=pk) { - EIGEN_ASM_COMMENT("begin gegp micro kernel 1pX4"); + EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4"); RhsPacket B_0, B1, B2, B3; #define EIGEN_GEBGP_ONESTEP(K) \ - traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, C0, B_0); \ - traits.madd(A0, B1, C1, B1); \ - traits.madd(A0, B2, C2, B2); \ - 
traits.madd(A0, B3, C3, B3); + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ + traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A0, B1, C1, B1); \ + traits.madd(A0, B2, C2, B2); \ + traits.madd(A0, B3, C3, B3); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ + } while(false) internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); @@ -1161,6 +1236,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> blB += pk*4*RhsProgress; blA += pk*1*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) @@ -1171,25 +1248,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> blA += 1*LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1; ResPacket alphav = pset1<ResPacket>(alpha); - - R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r1+0*Traits::ResPacketSize, R1); - - R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize); - R1 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(0 * Traits::ResPacketSize, R1); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(0 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C3, alphav, R1); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r3+0*Traits::ResPacketSize, R1); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(0 * Traits::ResPacketSize, R1); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2<cols; j2++) { @@ -1201,7 +1278,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> AccPacket C0; traits.initAcc(C0); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1209,14 +1286,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index k=0; k<peeled_kc; k+=pk) { - EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1"); + EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1"); RhsPacket B_0; #define EIGEN_GEBGP_ONESTEP(K) \ - traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ - + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \ + } while(false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1228,6 +1310,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> blB += pk*RhsProgress; blA += pk*1*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1"); } // process remaining peeled loop @@ -1241,9 +1325,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> 
#undef EIGEN_GEBGP_ONESTEP ResPacket R0; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); - pstoreu(r0+0*Traits::ResPacketSize, R0); + r0.storePacket(0 * Traits::ResPacketSize, R0); } } } @@ -1259,7 +1343,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - + if( (SwappedTraits::LhsProgress % 4)==0 ) { // NOTE The following piece of code won't work for 512 bit registers @@ -1268,32 +1352,32 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> straits.initAcc(C1); straits.initAcc(C2); straits.initAcc(C3); - + const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); const Index endk = (depth/spk)*spk; const Index endk4 = (depth/(spk*4))*(spk*4); - + Index k=0; for(; k<endk4; k+=4*spk) { SLhsPacket A0,A1; SRhsPacket B_0,B_1; - + straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1); - + straits.loadRhsQuad(blA+0*spk, B_0); straits.loadRhsQuad(blA+1*spk, B_1); straits.madd(A0,B_0,C0,B_0); straits.madd(A1,B_1,C1,B_1); - + straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); straits.madd(A0,B_0,C2,B_0); straits.madd(A1,B_1,C3,B_1); - + blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; } @@ -1302,11 +1386,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> { SLhsPacket A0; SRhsPacket B_0; - + straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); straits.madd(A0,B_0,C0,B_0); - + blB += SwappedTraits::LhsProgress; blA += spk; } @@ -1317,10 +1401,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf; - - SResPacketHalf R = pgather<SResScalar, SResPacketHalf>(&res[j2*resStride + i], resStride); + + SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2); SResPacketHalf alphav = pset1<SResPacketHalf>(alpha); - + if(depth-endk>0) { // We have to handle the last row of the rhs which corresponds to a half-packet @@ -1336,14 +1420,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> { straits.acc(predux4(C0), alphav, R); } - pscatter(&res[j2*resStride + i], R, resStride); + res.scatterPacket(i, j2, R); } else { - SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride); + SResPacket R = res.template gatherPacket<SResPacket>(i, j2); SResPacket alphav = pset1<SResPacket>(alpha); straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); + res.scatterPacket(i, j2, R); } } else // scalar path @@ -1355,25 +1439,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> { LhsScalar A0; RhsScalar B_0, B_1; - + A0 = blA[k]; - + B_0 = blB[0]; B_1 = blB[1]; - MADD(cj,A0,B_0,C0, B_0); - MADD(cj,A0,B_1,C1, B_1); + 
CJMADD(cj,A0,B_0,C0, B_0); + CJMADD(cj,A0,B_1,C1, B_1); B_0 = blB[2]; B_1 = blB[3]; - MADD(cj,A0,B_0,C2, B_0); - MADD(cj,A0,B_1,C3, B_1); + CJMADD(cj,A0,B_0,C2, B_0); + CJMADD(cj,A0,B_1,C3, B_1); blB += 4; } - res[(j2+0)*resStride + i] += alpha*C0; - res[(j2+1)*resStride + i] += alpha*C1; - res[(j2+2)*resStride + i] += alpha*C2; - res[(j2+3)*resStride + i] += alpha*C3; + res(i, j2 + 0) += alpha * C0; + res(i, j2 + 1) += alpha * C1; + res(i, j2 + 2) += alpha * C2; + res(i, j2 + 3) += alpha * C3; } } } @@ -1392,9 +1476,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> { LhsScalar A0 = blA[k]; RhsScalar B_0 = blB[k]; - MADD(cj, A0, B_0, C0, B_0); + CJMADD(cj, A0, B_0, C0, B_0); } - res[(j2+0)*resStride + i] += alpha*C0; + res(i, j2) += alpha * C0; } } } @@ -1417,15 +1501,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> // // 32 33 34 35 ... // 36 37 38 39 ... -template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode> -struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode> - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits<Scalar>::type Packet; enum { PacketSize = packet_traits<Scalar>::size }; @@ -1436,30 +1521,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - const_blas_data_mapper<Scalar, Index, ColMajor> lhs(_lhs,lhsStride); Index count = 0; - + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 : Pack2>1 ? 
(rows/Pack2)*Pack2 : 0; - + Index i=0; - + // Pack 3 packets if(Pack1>=3*PacketSize) { for(; i<peeled_mc3; i+=3*PacketSize) { if(PanelMode) count += (3*PacketSize) * offset; - + for(Index k=0; k<depth; k++) { Packet A, B, C; - A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); - B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); - C = ploadu<Packet>(&lhs(i+2*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); + C = lhs.loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1473,12 +1557,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj for(; i<peeled_mc2; i+=2*PacketSize) { if(PanelMode) count += (2*PacketSize) * offset; - + for(Index k=0; k<depth; k++) { Packet A, B; - A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); - B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1491,11 +1575,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj for(; i<peeled_mc1; i+=1*PacketSize) { if(PanelMode) count += (1*PacketSize) * offset; - + for(Index k=0; k<depth; k++) { Packet A; - A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } @@ -1508,11 +1592,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj for(; i<peeled_mc0; i+=Pack2) { if(PanelMode) count += Pack2 * offset; - + for(Index k=0; k<depth; k++) for(Index w=0; w<Pack2; w++) blockA[count++] = cj(lhs(i+w, k)); - + if(PanelMode) count += Pack2 * (stride-offset-depth); } } @@ -1525,15 +1609,16 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj } } -template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode> -struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode> { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode> - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits<Scalar>::type Packet; enum { PacketSize = packet_traits<Scalar>::size }; @@ -1543,13 +1628,12 @@ 
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj EIGEN_UNUSED_VARIABLE(offset); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - const_blas_data_mapper<Scalar, Index, RowMajor> lhs(_lhs,lhsStride); Index count = 0; - + // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - + int pack = Pack1; Index i = 0; while(pack>0) @@ -1569,7 +1653,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj for (Index m = 0; m < pack; m += PacketSize) { PacketBlock<Packet> kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k)); + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } @@ -1594,15 +1678,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj for(;w<pack;++w) blockA[count++] = cj(lhs(i+w, k)); } - + if(PanelMode) count += pack * (stride-offset-depth); } - + pack -= PacketSize; if(pack<Pack2 && (pack+PacketSize)!=Pack2) pack = Pack2; } - + for(; i<rows; i++) { if(PanelMode) count += offset; @@ -1619,17 +1703,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj // 4 5 6 7 16 17 18 19 25 28 // 8 9 10 11 20 21 22 23 26 29 // . . . . . . . . . . -template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> -struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> { typedef typename packet_traits<Scalar>::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits<Scalar>::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1685,27 +1770,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan // if(PanelMode) count += 8 * (stride-offset-depth); // } // } - + if(nr>=4) { for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) { // skip what we have before if(PanelMode) count += 4 * offset; - const Scalar* b0 = 
&rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - + const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + Index k=0; if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ?? { for(; k<peeled_k; k+=PacketSize) { PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel; - kernel.packet[0] = ploadu<Packet>(&b0[k]); - kernel.packet[1] = ploadu<Packet>(&b1[k]); - kernel.packet[2] = ploadu<Packet>(&b2[k]); - kernel.packet[3] = ploadu<Packet>(&b3[k]); + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1] = dm1.loadPacket(k); + kernel.packet[2] = dm2.loadPacket(k); + kernel.packet[3] = dm3.loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); @@ -1716,10 +1801,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan } for(; k<depth; k++) { - blockB[count+0] = cj(b0[k]); - blockB[count+1] = cj(b1[k]); - blockB[count+2] = cj(b2[k]); - blockB[count+3] = cj(b3[k]); + blockB[count+0] = cj(dm0(k)); + blockB[count+1] = cj(dm1(k)); + blockB[count+2] = cj(dm2(k)); + blockB[count+3] = cj(dm3(k)); count += 4; } // skip what we have after @@ -1731,10 +1816,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan for(Index j2=packet_cols4; j2<cols; ++j2) { if(PanelMode) count += offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const LinearMapper dm0 = rhs.getLinearMapper(0, j2); for(Index k=0; k<depth; k++) { - blockB[count] = cj(b0[k]); + blockB[count] = cj(dm0(k)); count += 1; } if(PanelMode) count += (stride-offset-depth); @@ -1742,17 +1827,18 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan } // this version is optimized for row major matrices -template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> -struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> { typedef typename packet_traits<Scalar>::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits<Scalar>::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); 
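Throughout these packing routines, raw (pointer, stride) pairs such as b0..b3 give way to LinearMappers handed out by a DataMapper. A toy column-major mapper along the following lines shows the small contract the packers and gebp now program against; it is a simplified stand-in for illustration, not Eigen's actual blas_data_mapper (which additionally supplies loadPacket/storePacket, prefetch and gather/scatter):

#include <cstddef>

template <typename Scalar>
class ColMajorMapper {
public:
  ColMajorMapper(Scalar* data, std::ptrdiff_t stride) : m_data(data), m_stride(stride) {}

  // rhs(k, j) style coefficient access.
  Scalar& operator()(std::ptrdiff_t i, std::ptrdiff_t j) const { return m_data[i + j*m_stride]; }

  // getSubMapper(i, j): the same view, shifted to start at (i, j).
  ColMajorMapper getSubMapper(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return ColMajorMapper(m_data + i + j*m_stride, m_stride);
  }

  // getLinearMapper(i, j): a 1-D walk down column j starting at row i,
  // playing the role of the dm0..dm3 objects in the packing loop above.
  class LinearMapper {
  public:
    explicit LinearMapper(Scalar* p) : m_p(p) {}
    Scalar& operator()(std::ptrdiff_t k) const { return m_p[k]; }
  private:
    Scalar* m_p;
  };
  LinearMapper getLinearMapper(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return LinearMapper(m_data + i + j*m_stride);
  }

private:
  Scalar* m_data;
  std::ptrdiff_t m_stride;
};

The design payoff is that the kernels stop assuming plain column-major dense storage: any operand able to expose this interface can feed the packers and gebp without further changes to them.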
EIGEN_UNUSED_VARIABLE(stride); @@ -1762,7 +1848,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; Index count = 0; - + // if(nr>=8) // { // for(Index j2=0; j2<packet_cols8; j2+=8) @@ -1805,15 +1891,15 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan for(Index k=0; k<depth; k++) { if (PacketSize==4) { - Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); + Packet A = rhs.loadPacket(k, j2); pstoreu(blockB+count, cj.pconj(A)); count += PacketSize; } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - blockB[count+2] = cj(b0[2]); - blockB[count+3] = cj(b0[3]); + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); count += 4; } } @@ -1825,10 +1911,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan for(Index j2=packet_cols4; j2<cols; ++j2) { if(PanelMode) count += offset; - const Scalar* b0 = &rhs[j2]; for(Index k=0; k<depth; k++) { - blockB[count] = cj(b0[k*rhsStride]); + blockB[count] = cj(rhs(k, j2)); count += 1; } if(PanelMode) count += stride-offset-depth; @@ -1841,8 +1926,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan * \sa setCpuCacheSize */ inline std::ptrdiff_t l1CacheSize() { - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); return l1; } @@ -1850,8 +1935,8 @@ inline std::ptrdiff_t l1CacheSize() * \sa setCpuCacheSize */ inline std::ptrdiff_t l2CacheSize() { - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); return l2; } @@ -1860,9 +1945,9 @@ inline std::ptrdiff_t l2CacheSize() * for the algorithms working per blocks. 
* * \sa computeProductBlockingSizes */ -inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) +inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) { - internal::manage_caching_sizes(SetAction, &l1, &l2); + internal::manage_caching_sizes(SetAction, &l1, &l2, &l3); } } // end namespace Eigen diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index b7e1867f0..fd9443cd2 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* res, Index resStride, + ResScalar* _res, Index resStride, ResScalar alpha, level3_blocking<LhsScalar,RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) { - const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; - gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; #ifdef EIGEN_HAS_OPENMP if(info) @@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth, // In order to reduce the chance that a thread has to wait for the other, // let's start by packing B'. - pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc); + pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc); // Pack A_k to A' in a parallel fashion: // each thread packs the sub block A_k,i to A'_i where i is the thread id. @@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth, // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; - - pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length); + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; @@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth, // At this point we have to make sure that A'_i has been updated by the thread i, // we use testAndSetOrdered to mimic a volatile access. 
// However, no need to wait for the B' part which has been updated by the current thread! - if(shift>0) - while(info[i].sync!=k) {} - gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); + if (shift>0) { + while(info[i].sync!=k) { + } + } + + gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } // Then keep going as usual with the remaining B' @@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth, const Index actual_nc = (std::min)(j+nc,cols)-j; // pack B_k,j to B' - pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc); + pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc); // C_j += A' * B' - gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha); + gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha); } // Release all the sub blocks A'_i of A' for the current thread, @@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); // For each horizontal panel of the lhs, and corresponding panel of the rhs... - for(Index k2=0; k2<depth; k2+=kc) + for(Index i2=0; i2<rows; i2+=mc) { - const Index actual_kc = (std::min)(k2+kc,depth)-k2; + const Index actual_mc = (std::min)(i2+mc,rows)-i2; - // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. - // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) - // Note that this panel will be read as many times as the number of blocks in the rhs's - // horizontal panel which is, in practice, a very low number. - pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows); - - // For each kc x nc block of the rhs's horizontal panel... - for(Index j2=0; j2<cols; j2+=nc) + for(Index k2=0; k2<depth; k2+=kc) { - const Index actual_nc = (std::min)(j2+nc,cols)-j2; - - // We pack the rhs's block into a sequential chunk of memory (L2 caching) - // Note that this block will be read a very high number of times, which is equal to the number of - // micro horizontal panel of the large rhs's panel (e.g., rows/12 times). - pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc); - - // Everything is packed, we can now call the panel * block kernel: - gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha); + const Index actual_kc = (std::min)(k2+kc,depth)-k2; + + // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. + // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc); + + // For each kc x nc block of the rhs's horizontal panel... + for(Index j2=0; j2<cols; j2+=nc) + { + const Index actual_nc = (std::min)(j2+nc,cols)-j2; + + // We pack the rhs's block into a sequential chunk of memory (L2 caching) + // Note that this block will be read a very high number of times, which is equal to the number of + // micro horizontal panel of the large rhs's panel (e.g., rows/12 times). 
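The serial path is thus restructured from a depth-then-columns nest into a rows/depth/columns (i2, k2, j2) nest, so each mc x kc slice of the lhs is packed once and reused across a whole row of rhs blocks. The self-contained toy program below (arbitrary small block sizes, plain scalar arithmetic standing in for pack_lhs/pack_rhs/gebp) illustrates the ordering and why accumulating into the result across the k2 loop still yields the full product:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

typedef std::ptrdiff_t Index;

// C += A*B with the (i2, k2, j2) block ordering of the serial path above.
// All matrices are column-major; mc/kc/nc are toy block sizes.
void blocked_gemm(const std::vector<double>& A, const std::vector<double>& B,
                  std::vector<double>& C, Index rows, Index depth, Index cols) {
  const Index mc = 4, kc = 3, nc = 2;
  for (Index i2 = 0; i2 < rows; i2 += mc)        // row block of the lhs
    for (Index k2 = 0; k2 < depth; k2 += kc)     // depth slice: pack A' here
      for (Index j2 = 0; j2 < cols; j2 += nc)    // column block: pack B' here
        // stand-in for gebp: accumulate the current block product into C
        for (Index j = j2; j < std::min(j2 + nc, cols); ++j)
          for (Index i = i2; i < std::min(i2 + mc, rows); ++i)
            for (Index k = k2; k < std::min(k2 + kc, depth); ++k)
              C[i + j*rows] += A[i + k*rows] * B[k + j*depth];
}

int main() {
  const Index r = 7, d = 5, c = 6;
  std::vector<double> A(r*d, 1.0), B(d*c, 2.0), C(r*c, 0.0);
  blocked_gemm(A, B, C, r, d, c);
  assert(C[0] == 2.0 * d); // each entry accumulates d products across the k2 loop
  return 0;
}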
+ pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc); + + // Everything is packed, we can now call the panel * block kernel: + gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha); + } } } } @@ -287,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M public: - gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false) + gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/) { this->m_mc = ActualRows; this->m_nc = ActualCols; @@ -319,23 +331,23 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M public: - gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false) + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking) { this->m_mc = Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) { - DenseIndex m = this->m_mc; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc); + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads); } - else // full columns + else // no l3 blocking { + DenseIndex m = this->m_mc; DenseIndex n = this->m_nc; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n); + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads); } - + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } @@ -445,8 +457,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor>, ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); - + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 7db3e3d38..e55994900 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -58,27 +58,31 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, { typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) { - const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<LhsScalar,RhsScalar> Traits; + typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction - computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc); + computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1); // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); - - gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; - gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb; for(Index k2=0; k2<depth; k2+=kc) @@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, const Index actual_kc = (std::min)(k2+kc,depth)-k2; // note that the actual rhs is the transpose/adjoint of mat - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size); + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size); for(Index i2=0; i2<size; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,size)-i2; - pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); // the selected actual_mc * size panel of res is split into three different part: // 1 - before the diagonal => processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); + - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); } } } @@ -129,13 +134,16 @@ struct tribb_kernel { typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits; typedef typename Traits::ResScalar ResScalar; - + enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper; + ResMapper res(_res, resStride); + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; // let's process the block per panel of actual_mc x BlockSize, @@ -146,7 +154,7 @@ struct tribb_kernel const RhsScalar* actual_b = 
blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, -1, -1, 0, 0); // selfadjoint micro block @@ -154,12 +162,12 @@ struct tribb_kernel Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1<actualBlockSize; ++j1) { - ResScalar* r = res + (j+j1)*resStride + i; + ResScalar* r = &res(i, j + j1); for(Index i1=UpLo==Lower ? j1 : 0; UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1) r[i1] += buffer(i1,j1); @@ -169,8 +177,8 @@ struct tribb_kernel if(UpLo==Lower) { Index i = j+actualBlockSize; - gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, - -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, + depth, actualBlockSize, alpha, -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 340c51394..7df6a6b1a 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,17 +48,17 @@ namespace internal { * // we currently fall back to the NoneAligned case * * The same reasoning applies for the transposed case. - * + * * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow * compared to unaligned loads on a 4 byte boundary.
* */ -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version> +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> { -typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable @@ -78,17 +78,17 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha); }; -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run( +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { @@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ pstore(&res[j], \ padd(pload<ResPacket>(&res[j]), \ padd( \ - padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ - padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \ + pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \ + pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) ))) + + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; @@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co const Index ResPacketAlignedMask = ResPacketSize-1; // const Index PeelAlignedMask = ResPacketSize*peels-1; const Index size = rows; - + + const Index lhsStride = lhs.stride(); + // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. 
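The ColMajor specialization whose body follows takes its operands through LhsMapper/RhsMapper objects instead of raw pointer-and-stride pairs, and addresses individual columns through LhsMapper::VectorMapper views (lhs0(j), lhs0.template load<LhsPacket, Aligned>(j)). As a rough orientation, here is a minimal sketch of the contract these kernels rely on; the names SimpleVectorMapper/SimpleColMajorMapper are hypothetical, and the real const_blas_data_mapper in BlasUtil.h additionally provides firstAligned(), aligned<Packet>() queries and packet loads:

    #include <cstddef>

    // Hedged sketch only: a stripped-down, column-major data mapper.
    template<typename Scalar>
    struct SimpleVectorMapper {
      const Scalar* ptr;                                   // start of one column
      Scalar operator()(std::ptrdiff_t i) const { return ptr[i]; }  // like lhs0(j)
    };

    template<typename Scalar>
    struct SimpleColMajorMapper {
      typedef SimpleVectorMapper<Scalar> VectorMapper;
      const Scalar* data;
      std::ptrdiff_t outer;                                // distance between columns
      std::ptrdiff_t stride() const { return outer; }
      Scalar operator()(std::ptrdiff_t i, std::ptrdiff_t j) const {
        return data[i + j*outer];                          // like rhs(i, 0)
      }
      VectorMapper getVectorMapper(std::ptrdiff_t i, std::ptrdiff_t j) const {
        VectorMapper v = { data + i + j*outer };           // like lhs.getVectorMapper(0, i)
        return v;
      }
    };

Hiding the pointer arithmetic behind such an interface is what lets one kernel run unchanged over plain dense storage and sub-matrices, and is presumably what allows non-strided layouts to be plugged in later.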
Index alignedStart = internal::first_aligned(res,size); @@ -131,15 +135,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,size); + const Index lhsAlignmentOffset = lhs.firstAligned(size); // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) + if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; + alignmentPattern = NoneAligned; } else if(LhsPacketSize > 4) { @@ -149,7 +154,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co } else if (LhsPacketSize>1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); while (skipColumns<LhsPacketSize && alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) @@ -166,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co // note that the skipped columns are processed later. } - eigen_internal_assert( (alignmentPattern==NoneAligned) + /* eigen_internal_assert( (alignmentPattern==NoneAligned) || (skipColumns + columnsAtOnce >= cols) || LhsPacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -178,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co alignmentPattern = AllAligned; } - Index offset1 = (FirstAligned && alignmentStep==1?3:1); - Index offset3 = (FirstAligned && alignmentStep==1?1:3); + const Index offset1 = (FirstAligned && alignmentStep==1?3:1); + const Index offset3 = (FirstAligned && alignmentStep==1?1:3); Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) { - RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]), - ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]), - ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]), - ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]); + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)), + ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)), + ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0)); // this helps a lot in generating better binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); if (Vectorizable) { @@ -199,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co // process initial unaligned coeffs for (Index j=0; j<alignedStart; ++j) { - res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); - res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); - res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); -
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } if (alignedSize>alignedStart) @@ -211,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co { case AllAligned: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,d,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned); break; case EvenAligned: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned); break; case FirstAligned: { @@ -225,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; ResPacket T0, T1; - A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); - A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); - A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1); + A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2); + A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3); for (; j<peeledSize; j+=peels*ResPacketSize) { - A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13); - A00 = pload<LhsPacket>(&lhs0[j]); - A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + A00 = lhs0.template load<LhsPacket, Aligned>(j); + A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize); T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j])); T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize])); T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01); T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02); T0 = pcj.pmadd(A03, ptmp3, T0); pstore(&res[j],T0); - A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03); T1 = pcj.pmadd(A11, ptmp1, T1); T1 = pcj.pmadd(A12, ptmp2, T1); T1 = pcj.pmadd(A13, ptmp3, T1); @@ -254,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co } } for (; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,du); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned); break; } default: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(du,du,du); + _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned); break; } } @@ -268,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co /* process remaining coeffs (or all if there is no explicit vectorization) */ for (Index j=alignedSize; j<size; ++j) { - res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); - res[j] 
= cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); - res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); - res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } } @@ -282,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co { for (Index k=start; k<end; ++k) { - RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]); - const LhsScalar* lhs0 = lhs + k*lhsStride; + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs for (Index j=0; j<alignedStart; ++j) - res[j] += cj.pmul(lhs0[j], pfirst(ptmp0)); + res[j] += cj.pmul(lhs0(j), pfirst(ptmp0)); // process aligned result's coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned<LhsPacket>(alignedStart)) for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) - pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i]))); else for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) - pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) for (Index i=alignedSize; i<size; ++i) - res[i] += cj.pmul(lhs0[i], pfirst(ptmp0)); + res[i] += cj.pmul(lhs0(i), pfirst(ptmp0)); } if (skipColumns) { @@ -326,8 +331,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co * - alpha is always a complex (or converted to a complex) * - no vectorization */ -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version> +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> { typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; @@ -346,70 +351,75 @@ typedef typename packet_traits<ResScalar>::type _ResPacket; typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; - + EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha); }; -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run( +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool 
ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - EIGEN_UNUSED_VARIABLE(rhsIncr); - eigen_internal_assert(rhsIncr==1); - + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - RhsPacket b = pload<RhsPacket>(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \ + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); } conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; const Index RhsPacketAlignedMask = RhsPacketSize-1; const Index LhsPacketAlignedMask = LhsPacketSize-1; -// const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; + const Index lhsStride = lhs.stride(); // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - Index alignedStart = internal::first_aligned(rhs, depth); + Index alignedStart = rhs.firstAligned(depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth); + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || + (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || + (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) { alignedSize = 0; alignedStart = 0; + alignmentPattern = NoneAligned; } else if(LhsPacketSize > 4) { @@ -418,7 +428,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co } else if (LhsPacketSize>1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); while (skipRows<LhsPacketSize && alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) @@ -434,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co skipRows = (std::min)(skipRows,Index(rows)); // note that the skipped rows are processed later. } - eigen_internal_assert( alignmentPattern==NoneAligned + /* eigen_internal_assert( alignmentPattern==NoneAligned || LhsPacketSize==1 || (skipRows + rowsAtOnce >= rows) || LhsPacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -447,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co alignmentPattern = AllAligned; } - Index offset1 = (FirstAligned && alignmentStep==1?3:1); - Index offset3 = (FirstAligned && alignmentStep==1?1:3); + const Index offset1 = (FirstAligned && alignmentStep==1?3:1); + const Index offset3 = (FirstAligned && alignmentStep==1?1:3); Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) @@ -457,8 +467,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generate good binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0), + lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0); if (Vectorizable) { @@ -470,9 +480,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co // FIXME this loop gets vectorized by the compiler !
for (Index j=0; j<alignedStart; ++j) { - RhsScalar b = rhs[j]; - tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); - tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + RhsScalar b = rhs(j, 0); + tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); + tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b); } if (alignedSize>alignedStart) @@ -481,11 +491,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { case AllAligned: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,d,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned); break; case EvenAligned: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned); break; case FirstAligned: { @@ -499,39 +509,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co * than basic unaligned loads. */ LhsPacket A01, A02, A03, A11, A12, A13; - A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); - A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); - A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1); + A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2); + A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3); for (; j<peeledSize; j+=peels*RhsPacketSize) { - RhsPacket b = pload<RhsPacket>(&rhs[j]); - A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); + A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13); - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03); - b = pload<RhsPacket>(&rhs[j+RhsPacketSize]); - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,du); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned); break; } default: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(du,du,du); + _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned); break; } tmp0 += predux(ptmp0); @@ -545,9 +555,9 @@ EIGEN_DONT_INLINE void 
general_matrix_vector_product<Index,LhsScalar,RowMajor,Co // FIXME this loop gets vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) { - RhsScalar b = rhs[j]; - tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); - tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + RhsScalar b = rhs(j, 0); + tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); + tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b); } res[i*resIncr] += alpha*tmp0; res[(i+offset1)*resIncr] += alpha*tmp1; @@ -564,28 +574,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0); ResPacket ptmp0 = pset1<ResPacket>(tmp0); - const LhsScalar* lhs0 = lhs + i*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); // process first unaligned result's coeffs // FIXME this loop gets vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) - tmp0 += cj.pmul(lhs0[j], rhs[j]); + tmp0 += cj.pmul(lhs0(j), rhs(j, 0)); if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned<LhsPacket>(alignedStart)) for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0); else for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) - ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0); tmp0 += predux(ptmp0); } // process remaining scalars // FIXME this loop gets vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) - tmp0 += cj.pmul(lhs0[j], rhs[j]); + tmp0 += cj.pmul(lhs0(j), rhs(j, 0)); res[i*resIncr] += alpha*tmp0; } if (skipRows) diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 126cfbbff..2b90abf8f 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 4e507b6cf..4b6316d63 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -324,20 +324,26 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha) { Index size = rows; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<Scalar,Scalar> Traits; + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ?
ColMajor : RowMajor> LhsTransposeMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); + computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1); // kc must be smaller than mc kc = (std::min)(kc,mc); @@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); Scalar* blockB = allocatedBlockB; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; for(Index k2=0; k2<size; k2+=kc) { @@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t { const Index actual_mc = (std::min)(i2+mc,k2)-i2; // transposed packed copy - pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); + pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } // the block diagonal { @@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // symmetric packed copy pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } for(Index i2=k2+kc; i2<size; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,size)-i2; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() - (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, k2),
actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -414,26 +420,29 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha) { Index size = cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - typedef gebp_traits<Scalar,Scalar> Traits; - Index kc = size; // cache block size along the K direction + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride); + + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); + computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); Scalar* blockB = allocatedBlockB; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=0; k2<size; k2+=kc) @@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f for(Index i2=0; i2<rows; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,rows)-i2; - pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index c2d0817ea..60c99dcd2 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros @@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index depth = IsLower ? 
diagSize : _depth; Index cols = _cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, else triangularBuffer.diagonal().setOnes(); - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? k2>0 : k2<depth; @@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, k2 = k2+actual_kc-kc; } - pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); + pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the part which is zero => skip it @@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i) triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k); } - pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); + pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth); - gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, + gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB, + actualPanelWidth, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); // GEBP with remaining micro panel @@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, { Index startTarget = IsLower ? 
actual_k2+k1+actualPanelWidth : actual_k2; - pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, + gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB, + lengthTarget, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for(Index i2=start; i2<end; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,end)-i2; - gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() - (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros @@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index depth = IsLower ? _depth : diagSize; Index cols = IsLower ? diagSize : _cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, else triangularBuffer.diagonal().setOnes(); - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? 
k2<depth : k2>0; @@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Scalar* geb = blockB+ts*ts; geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar)); - pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); + pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs); // pack the triangular part of the rhs padding the unrolled blocks with zeros if(ts>0) @@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; // general part pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), rhsStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); @@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, } pack_rhs_panel(blockB+j2*actual_kc, - triangularBuffer.data(), triangularBuffer.outerStride(), + RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth, actual_kc, j2); } @@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, for (Index i2=0; i2<rows; i2+=mc) { const Index actual_mc = (std::min)(mc,rows-i2); - pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); // triangular kernel if(ts>0) @@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; Index blockOffset = IsLower ? j2 : 0; - gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, + gebp_kernel(res.getSubMapper(i2, actual_k2 + j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, alpha, @@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, blockOffset, blockOffset);// offsets } } - gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, + gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2), blockA, geb, actual_mc, actual_kc, rs, alpha, -1, -1, 0, 0); @@ -402,7 +413,7 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false> Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? 
rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix<Scalar, Index, Mode, LhsIsTriangular, diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 92d64e384..4d88a710b 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_TRIANGULARMATRIXVECTOR_H #define EIGEN_TRIANGULARMATRIXVECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); - + typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap; const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); @@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap; ResMap res(_res,rows); + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + for (Index pi=0; pi<size; pi+=PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, size-pi); @@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con if (r>0) { Index s = IsLower ? pi+actualPanelWidth : 0; - general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( r, actualPanelWidth, - &lhs.coeffRef(s,pi), lhsStride, - &rhs.coeffRef(pi), rhsIncr, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, alpha); } } if((!IsLower) && cols>size) { - general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( rows, cols-size, - &lhs.coeffRef(0,size), lhsStride, - &rhs.coeffRef(size), rhsIncr, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), _res, resIncr, alpha); } } @@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap; ResMap res(_res,rows,InnerStride<>(resIncr)); - + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + for (Index pi=0; pi<diagSize; pi+=PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi); @@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con if (r>0) { Index s = IsLower ? 
0 : pi + actualPanelWidth; - general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( actualPanelWidth, r, - &lhs.coeffRef(pi,s), lhsStride, - &rhs.coeffRef(s), rhsIncr, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, alpha); } } if(IsLower && rows>diagSize) { - general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( rows-diagSize, cols, - &lhs.coeffRef(diagSize,0), lhsStride, - &rhs.coeffRef(0), rhsIncr, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), &res.coeffRef(diagSize), resIncr, alpha); } } @@ -231,7 +237,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor> bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), @@ -251,7 +257,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor> else MappedDest(actualDestPtr, dest.size()) = dest; } - + internal::triangular_matrix_vector_product <Index,Mode, LhsScalar, LhsBlasTraits::NeedToConjugate, @@ -311,7 +317,7 @@ template<int Mode> struct trmv_selector<Mode,RowMajor> #endif Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; } - + internal::triangular_matrix_vector_product <Index,Mode, LhsScalar, LhsBlasTraits::NeedToConjugate, diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 1f7afd187..f5de67c59 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju level3_blocking<Scalar,Scalar>& blocking) { Index cols = otherSize; - const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride); - blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride); + + typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper; + typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper; + TriMapper tri(_tri, triStride); + OtherMapper other(_other, otherStride); typedef gebp_traits<Scalar,Scalar> Traits; + enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower @@ -71,14 +75,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); conj_if<Conjugate> conj; - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs; + gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; + gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs; // the goal here is to 
subdivide the Rhs panels such that we keep some cache // coherence when accessing the rhs elements - std::ptrdiff_t l1, l2; - manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr); @@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju Index blockBOffset = IsLower ? k1 : lengthTarget; // update the respective rows of B from other - pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset); + pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset); // GEBP if (lengthTarget>0) { Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; - pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju const Index actual_mc = (std::min)(mc,end-i2); if (actual_mc>0) { - pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); + gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); } } } @@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj level3_blocking<Scalar,Scalar>& blocking) { Index rows = otherSize; - const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride); - blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride); + + typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper; + LhsMapper lhs(_other, otherStride); + RhsMapper rhs(_tri, triStride); typedef gebp_traits<Scalar,Scalar> Traits; enum { @@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); conj_if<Conjugate> conj; - gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel; + gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
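Before the k2 loop that follows, the solver sizes its rhs panels from the cache geometry: manage_caching_sizes now reports three levels (the new l3 mirrors the Parallelizer.h change earlier in this patch), though only l2 feeds this particular heuristic. A sketch of that computation as a standalone function with assumed parameter names; the l2-based formula and the rounding to a multiple of Traits::nr are taken from the two lines above:

    #include <algorithm>
    #include <cstddef>

    // Hedged sketch: choose how many rhs columns to process per sweep so that
    // the active slab stays within a fraction of the L2 cache.
    inline std::ptrdiff_t rhsPanelWidth(std::ptrdiff_t cols, std::ptrdiff_t otherStride,
                                        std::ptrdiff_t scalarSize, std::ptrdiff_t nr,
                                        std::ptrdiff_t l2)
    {
      // Budget roughly a quarter of L2 for the columns currently being updated.
      std::ptrdiff_t subcols = cols > 0 ? l2 / (4 * scalarSize * otherStride) : 0;
      // Round down to a multiple of the register-blocking width nr, never below nr.
      return std::max<std::ptrdiff_t>((subcols / nr) * nr, nr);
    }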
for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2<size; @@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc; Scalar* geb = blockB+actual_kc*actual_kc; - if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs); // triangular packing (we only pack the panels off the diagonal, // neglecting the blocks overlapping the diagonal @@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj if (panelLength>0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj // GEBP if(panelLength>0) { - gebp_kernel(&lhs(i2,absolute_j2), otherStride, + gebp_kernel(lhs.getSubMapper(i2,absolute_j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, Scalar(-1), @@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj } // pack the just computed part of lhs to A - pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride, + pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride), actualPanelWidth, actual_mc, actual_kc, j2); } } if (rs>0) - gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0); } diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h index ce4d10088..b994759b2 100644 --- a/Eigen/src/Core/products/TriangularSolverVector.h +++ b/Eigen/src/Core/products/TriangularSolverVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H #define EIGEN_TRIANGULAR_SOLVER_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co >::run(size, _lhs, lhsStride, rhs); } }; - + // forward and backward substitution, row-major, rhs is a vector template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate> struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor> @@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con { typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; + typename internal::conditional< Conjugate, const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, @@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con Index startRow = IsLower ? pi : pi-actualPanelWidth; Index startCol = IsLower ? 
0 : pi; - general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run( actualPanelWidth, r, - &lhs.coeffRef(startRow,startCol), lhsStride, - rhs + startCol, 1, + LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride), + RhsMapper(rhs + startCol, 1), rhs + startRow, 1, RhsScalar(-1)); } @@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con Index s = IsLower ? pi : i+1; if (k>0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum(); - + if(!(Mode & UnitDiag)) rhs[i] /= cjLhs(i,i); } @@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con { typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper; typename internal::conditional<Conjugate, const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, const LhsMap& @@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con // let's directly call the low level product function because: // 1 - it is faster to compile // 2 - it is slightly faster at runtime - general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run( r, actualPanelWidth, - &lhs.coeffRef(endBlock,startBlock), lhsStride, - rhs+startBlock, 1, + LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), + RhsMapper(rhs+startBlock, 1), rhs+endBlock, 1, RhsScalar(-1)); } } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 9f9115c2a..3ec55fad2 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -18,13 +18,13 @@ namespace Eigen { namespace internal { // forward declarations -template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> struct gebp_kernel; -template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> +template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> struct gemm_pack_rhs; -template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> struct gemm_pack_lhs; template< @@ -34,7 +34,9 @@ template< int ResStorageOrder> struct general_matrix_matrix_product; -template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized> +template<typename Index, + typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized> struct general_matrix_vector_product; @@ -117,32
+119,133 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); } }; + +template<typename Scalar, typename Index> +class BlasVectorMapper { + public: + EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_data[i]; + } + template <typename Packet, int AlignmentType> + EIGEN_ALWAYS_INLINE Packet load(Index i) const { + return ploadt<Packet, AlignmentType>(m_data + i); + } + + template <typename Packet> + bool aligned(Index i) const { + return (size_t(m_data+i)%sizeof(Packet))==0; + } + + protected: + Scalar* m_data; +}; + +template<typename Scalar, typename Index, int AlignmentType> +class BlasLinearMapper { + public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt<Packet, AlignmentType>(m_data + i); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt<HalfPacket, AlignmentType>(m_data + i); + } + + EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + pstoret<Scalar, Packet, AlignmentType>(m_data + i, p); + } + + protected: + Scalar *m_data; +}; + // Lightweight helper class to access matrix coefficients. -// Yes, this is somehow redundant with Map<>, but this version is much much lighter, -// and so I hope better compilation performance (time and code quality). -template<typename Scalar, typename Index, int StorageOrder> -class blas_data_mapper -{ +template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned> +class blas_data_mapper { public: - blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j) - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper; + typedef BlasVectorMapper<Scalar, Index> VectorMapper; + + EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + + EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType> + getSubMapper(Index i, Index j) const { + return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride); + } + + EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(&operator()(i, j)); + } + + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt<Packet, AlignmentType>(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt<HalfPacket, AlignmentType>(&operator()(i, j)); + } + + template<typename SubPacket> + EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const { + pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride); + } + + template<typename SubPacket> + EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride); + } + + const Index stride() const { return m_stride; } + + Index firstAligned(Index size) const { + if (size_t(m_data)%sizeof(Scalar)) { + return -1; + } + return internal::first_aligned(m_data, size); + } + protected: - Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; }; // lightweight helper class to access matrix coefficients (const version) template<typename Scalar, typename Index, int StorageOrder> -class const_blas_data_mapper -{ +class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> { public: - const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const - { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } - protected: - const Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {} + + EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const { + return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride); + } }; diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 5c7d70af6..d1855b50b 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100; * \sa \ref RowMajorBit, \ref TopicStorageOrders */ const unsigned int NoPreferredStorageOrderBit = 0x200; +/** \ingroup flags + * + * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format, + * that is, the expression provides: + * \code + inline const Scalar* valuePtr() const; + inline const Index* innerIndexPtr() const; + inline const Index* outerIndexPtr() const; + inline const Index* innerNonZeroPtr() const; + \endcode + */ +const unsigned int CompressedAccessBit = 0x400; + // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit @@ -449,6 +462,9 @@ enum Action {GetAction, SetAction}; /** The type used to identify a dense storage. */ struct Dense {}; +/** The type used to identify a general sparse storage. */ +struct Sparse {}; + /** The type used to identify a permutation storage. 
*/ struct PermutationStorage {}; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 11b7e2887..07923848a 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -73,7 +73,7 @@ /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) - #define EIGEN_COMP_MSVC_STRICT 1 + #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 #endif @@ -160,6 +160,12 @@ #define EIGEN_ARCH_ARM64 0 #endif +#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 + #define EIGEN_ARCH_ARM_OR_ARM64 1 +#else + #define EIGEN_ARCH_ARM_OR_ARM64 0 +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -376,10 +382,21 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support variadic templates? +#if __cplusplus > 199711L +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#endif + +// Does the compiler support const expressions? +#if (defined(__cplusplus) && __cplusplus >= 201402L) || \ + EIGEN_GNUC_AT_LEAST(4,9) +#define EIGEN_HAS_CONSTEXPR 1 +#endif + /** Allows one to disable some optimizations which might affect the accuracy of the result. * Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them. * They currently include: - * - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled. + * - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled. */ #ifndef EIGEN_FAST_MATH #define EIGEN_FAST_MATH 1 @@ -526,7 +543,7 @@ namespace Eigen { #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) - #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64 + #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) @@ -540,7 +557,9 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM +#if (defined __CUDACC__) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif EIGEN_COMP_MSVC #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) @@ -592,7 +611,7 @@ namespace Eigen { // just an empty macro !
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT +#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a54ccaedc..16f8cc1b0 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = *** Implementation of generic aligned realloc (when no realloc can be used)*** *****************************************************************************/ -void* aligned_malloc(std::size_t size); -void aligned_free(void *ptr); +EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); +EIGEN_DEVICE_FUNC void aligned_free(void *ptr); /** \internal * \brief Reallocates aligned memory. @@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size) *****************************************************************************/ #ifdef EIGEN_NO_MALLOC -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)"); } #elif defined EIGEN_RUNTIME_NO_MALLOC -inline bool is_malloc_allowed_impl(bool update, bool new_value = false) +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) { static bool value = true; if (update == 1) value = new_value; return value; } -inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } -inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } +EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } #else -inline void check_that_malloc_is_allowed() +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ -inline void* aligned_malloc(size_t size) +EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) { check_that_malloc_is_allowed(); @@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size) } /** \internal Frees memory allocated with aligned_malloc. */ -inline void aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if !EIGEN_ALIGN std::free(ptr); @@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown. 
*/ -template<bool Align> inline void* conditional_aligned_malloc(size_t size) +template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) { return aligned_malloc(size); } -template<> inline void* conditional_aligned_malloc<false>(size_t size) +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size) { check_that_malloc_is_allowed(); @@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size) } /** \internal Frees memory allocated with conditional_aligned_malloc */ -template<bool Align> inline void conditional_aligned_free(void *ptr) +template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { aligned_free(ptr); } -template<> inline void conditional_aligned_free<false>(void *ptr) +template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr) { std::free(ptr); } @@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new /** \internal Destructs the elements of an array. * The \a size parameter tells on how many objects to call the destructor of T. */ -template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size) { // always destruct an array starting from the end. if(ptr) @@ -351,7 +351,7 @@ template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size) /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size) { size_t i; EIGEN_TRY @@ -371,7 +371,7 @@ template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size) *****************************************************************************/ template<typename T> -EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) { if(size > size_t(-1) / sizeof(T)) throw_std_bad_alloc(); @@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown. * The default constructor of T is called. */ -template<typename T> inline T* aligned_new(size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) { check_size_for_overflow<T>(size); T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size)); @@ -396,7 +396,7 @@ template<typename T> inline T* aligned_new(size_t size) } } -template<typename T, bool Align> inline T* conditional_aligned_new(size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size) { check_size_for_overflow<T>(size); T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size)); @@ -414,7 +414,7 @@ template<typename T, bool Align> inline T* conditional_aligned_new(size_t size) /** \internal Deletes objects constructed with aligned_new * The \a size parameter tells on how many objects to call the destructor of T.
*/ -template<typename T> inline void aligned_delete(T *ptr, size_t size) +template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size) { destruct_elements_of_array<T>(ptr, size); aligned_free(ptr); @@ -423,13 +423,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size) /** \internal Deletes objects constructed with conditional_aligned_new * The \a size parameter tells on how many objects to call the destructor of T. */ -template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size) { destruct_elements_of_array<T>(ptr, size); conditional_aligned_free<Align>(ptr); } -template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) { check_size_for_overflow<T>(new_size); check_size_for_overflow<T>(old_size); @@ -452,7 +452,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt } -template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size) { if(size==0) return 0; // short-cut. Also fixes Bug 884 @@ -495,7 +495,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto( return result; } -template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size) +template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size) { if(NumTraits<T>::RequireInitialization) destruct_elements_of_array<T>(ptr, size); @@ -523,9 +523,8 @@ template<typename T, bool Align> inline void conditional_aligned_delete_auto(T * template<typename Scalar, typename Index> inline Index first_aligned(const Scalar* array, Index size) { - enum { PacketSize = packet_traits<Scalar>::size, - PacketAlignedMask = PacketSize-1 - }; + static const Index PacketSize = packet_traits<Scalar>::size; + static const Index PacketAlignedMask = PacketSize-1; if(PacketSize==1) { diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 299e5cbc2..528ebe297 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -463,6 +463,21 @@ template<typename XprType, typename CastType> struct cast_return_type const XprType&,CastType>::type type; }; +template <typename A, typename B> struct promote_storage_type; + +template <typename A> struct promote_storage_type<A,A> +{ + typedef A ret; +}; +template <typename A> struct promote_storage_type<A, const A> +{ + typedef A ret; +}; +template <typename A> struct promote_storage_type<const A, A> +{ + typedef A ret; +}; + /** \internal Specify the "storage kind" of applying a coefficient-wise * binary operation between two expressions of kinds A and B respectively.
* The template parameter Functor permits to specialize the resulting storage kind wrt to diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index ae10ff91e..128ef9028 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -313,7 +313,7 @@ namespace Eigen { using std::abs; using std::sqrt; const Index dim=m_S.cols(); - if (abs(m_S.coeff(i+1,i)==Scalar(0))) + if (abs(m_S.coeff(i+1,i))==Scalar(0)) return; Index z = findSmallDiagEntry(i,i+1); if (z==i-1) diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 10f5fb174..51e61ba38 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -234,7 +234,7 @@ template<typename _MatrixType> class RealSchur typedef Matrix<Scalar,3,1> Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu, const Scalar& norm); + Index findSmallSubdiagEntry(Index iu); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -286,7 +286,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu, norm); + Index il = findSmallSubdiagEntry(iu); // Check for convergence if (il == iu) // One root found @@ -343,16 +343,14 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template<typename MatrixType> -inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& norm) +inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (s == 0.0) - s = norm; - if (abs(m_matT.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s) + if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s) break; res--; } @@ -457,9 +455,7 @@ inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const V const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2))); const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1))); if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs) - { break; - } } } diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 276e94c58..54842b5ff 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -78,6 +78,8 @@ struct traits<Transform<_Scalar,_Dim,_Mode,_Options> > }; }; +template<int Mode> struct transform_make_affine; + } // end namespace internal /** \geometry_module \ingroup Geometry_Module @@ -247,8 +249,7 @@ public: inline Transform() { check_template_params(); - if (int(Mode)==Affine) - makeAffine(); + internal::transform_make_affine<(int(Mode)==Affine) ? 
Affine : AffineCompact>::run(m_matrix); } inline Transform(const Transform& other) @@ -611,11 +612,7 @@ public: */ void makeAffine() { - if(int(Mode)!=int(AffineCompact)) - { - matrix().template block<1,Dim>(Dim,0).setZero(); - matrix().coeffRef(Dim,Dim) = Scalar(1); - } + internal::transform_make_affine<int(Mode)>::run(m_matrix); } /** \internal @@ -1103,6 +1100,24 @@ Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBas namespace internal { +template<int Mode> +struct transform_make_affine +{ + template<typename MatrixType> + static void run(MatrixType &mat) + { + static const int Dim = MatrixType::ColsAtCompileTime-1; + mat.template block<1,Dim>(Dim,0).setZero(); + mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1); + } +}; + +template<> +struct transform_make_affine<AffineCompact> +{ + template<typename MatrixType> static void run(MatrixType &) { } +}; + // selector needed to avoid taking the inverse of a 3x4 matrix template<typename TransformType, int Mode=TransformType::Mode> struct projective_transform_inverse diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 5f55efbe9..a715c7285 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -139,11 +139,7 @@ struct traits<BiCGSTAB<_MatrixType,_Preconditioner> > * \include BiCGSTAB_simple.cpp * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * \include BiCGSTAB_step_by_step.cpp - * Note that such a step by step execution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ @@ -192,7 +188,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) + if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 2ccbf4fe6..d8bc13f5f 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -113,8 +113,8 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> > * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * Upper, or Lower|Upper, in which case the full matrix entries will be considered. Default is Lower. * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() @@ -137,20 +137,7 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution.
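As a usage note for the new Lower|Upper mode documented in the hunk above, here is a minimal, hypothetical sketch; it assumes a symmetric SparseMatrix<double> with both triangles stored and a matching right-hand side:

#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>

// Solve A*x = b for a symmetric A whose lower AND upper parts are populated.
// With Lower|Upper the solver works on the full matrix directly instead of
// going through a selfadjoint view of a single triangle.
Eigen::VectorXd solveFullSymmetric(const Eigen::SparseMatrix<double>& A,
                                   const Eigen::VectorXd& b)
{
  Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower|Eigen::Upper> cg;
  cg.compute(A);
  return cg.solve(b);   // iterations start from x = 0; see solveWithGuess()
}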
- * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * cg.setMaxIterations(1); - * int i = 0; - * do { - * x = cg.solveWithGuess(b,x); - * std::cout << i << " : " << cg.error() << std::endl; - * ++i; - * } while (cg.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ @@ -196,6 +183,10 @@ public: template<typename Rhs,typename Dest> void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional<UpLo==(Lower|Upper), + Ref<const MatrixType>&, + SparseSelfAdjointView<const Ref<const MatrixType>, UpLo> + >::type MatrixWrapperType; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -205,8 +196,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj, - Base::m_preconditioner, m_iterations, m_error); + internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } m_isInitialized = true; diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index cc99e00f9..2afd80f4d 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -37,7 +37,7 @@ public: /** Default constructor. */ IterativeSolverBase() - : mp_matrix(0) + : m_dummy(0,0), mp_matrix(m_dummy) { init(); } @@ -52,10 +52,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit IterativeSolverBase(const MatrixType& A) + template<typename SparseMatrixDerived> + explicit IterativeSolverBase(const SparseMatrixBase<SparseMatrixDerived>& A) { init(); - compute(A); + compute(A.derived()); } ~IterativeSolverBase() {} @@ -65,9 +66,11 @@ public: * Currently, this function mostly calls analyzePattern on the preconditioner. In the future * we might, for instance, implement column reordering for faster matrix vector products. */ - Derived& analyzePattern(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& analyzePattern(const SparseMatrixBase<SparseMatrixDerived>& A) { - m_preconditioner.analyzePattern(A); + grab(A); + m_preconditioner.analyzePattern(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_info = Success; @@ -83,11 +86,12 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - Derived& factorize(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& factorize(const SparseMatrixBase<SparseMatrixDerived>& A) { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); - mp_matrix = &A; - m_preconditioner.factorize(A); + grab(A); + m_preconditioner.factorize(mp_matrix); m_factorizationIsOk = true; m_info = Success; return derived(); @@ -103,10 +107,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. 
*/ - Derived& compute(const MatrixType& A) + template<typename SparseMatrixDerived> + Derived& compute(const SparseMatrixBase<SparseMatrixDerived>& A) { - mp_matrix = &A; - m_preconditioner.compute(A); + grab(A); + m_preconditioner.compute(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_factorizationIsOk = true; @@ -115,9 +120,10 @@ } /** \internal */ - StorageIndex rows() const { return mp_matrix ? mp_matrix->rows() : 0; } + Index rows() const { return mp_matrix.rows(); } + /** \internal */ - StorageIndex cols() const { return mp_matrix ? mp_matrix->cols() : 0; } + Index cols() const { return mp_matrix.cols(); } /** \returns the tolerance threshold used by the stopping criteria */ RealScalar tolerance() const { return m_tolerance; } @@ -135,13 +141,18 @@ /** \returns a read-only reference to the preconditioner. */ const Preconditioner& preconditioner() const { return m_preconditioner; } - /** \returns the max number of iterations */ + /** \returns the max number of iterations. + * It is either the value set by setMaxIterations or, by default, + * twice the number of columns of the matrix. + */ int maxIterations() const { - return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations; + return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations; } - /** Sets the max number of iterations */ + /** Sets the max number of iterations. + * Default is twice the number of columns of the matrix. + */ Derived& setMaxIterations(int maxIters) { m_maxIterations = maxIters; @@ -210,7 +221,16 @@ protected: m_maxIterations = -1; m_tolerance = NumTraits<Scalar>::epsilon(); } - const MatrixType* mp_matrix; + + template<typename SparseMatrixDerived> + void grab(const SparseMatrixBase<SparseMatrixDerived> &A) + { + mp_matrix.~Ref<const MatrixType>(); + ::new (&mp_matrix) Ref<const MatrixType>(A); + } + + MatrixType m_dummy; + Ref<const MatrixType> mp_matrix; Preconditioner m_preconditioner; int m_maxIterations; diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 27acf4128..e20c9ba2a 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -43,7 +43,7 @@ namespace internal typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; template<typename _MatrixType, int Options> @@ -52,7 +52,7 @@ namespace internal typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; template<typename _MatrixType, int Options> @@ -61,7 +61,7 @@ namespace internal typedef _MatrixType MatrixType; typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef typename _MatrixType::Index Index; + typedef typename _MatrixType::StorageIndex StorageIndex; }; void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) @@ -138,7 +138,6 @@ class PastixBase : public SparseSolverBase<Derived> typedef _MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::Index Index; typedef typename
MatrixType::StorageIndex StorageIndex; typedef Matrix<Scalar,Dynamic,1> Vector; typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix; @@ -163,7 +162,7 @@ class PastixBase : public SparseSolverBase<Derived> * The statistics related to the different phases of factorization and solve are saved here as well * \sa analyzePattern() factorize() */ - Array<Index,IPARM_SIZE,1>& iparm() + Array<StorageIndex,IPARM_SIZE,1>& iparm() { return m_iparm; } @@ -243,8 +242,8 @@ class PastixBase : public SparseSolverBase<Derived> mutable int m_comm; // The MPI communicator identifier mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters - mutable Matrix<Index,Dynamic,1> m_perm; // Permutation vector - mutable Matrix<Index,Dynamic,1> m_invp; // Inverse permutation vector + mutable Matrix<StorageIndex,Dynamic,1> m_perm; // Permutation vector + mutable Matrix<StorageIndex,Dynamic,1> m_invp; // Inverse permutation vector mutable int m_size; // Size of the matrix }; @@ -410,7 +409,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> > typedef _MatrixType MatrixType; typedef PastixBase<PastixLU<MatrixType> > Base; typedef typename Base::ColSpMatrix ColSpMatrix; - typedef typename MatrixType::Index Index; + typedef typename MatrixType::StorageIndex StorageIndex; public: PastixLU() : Base() diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 6fade3755..0dee139ce 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -477,20 +477,10 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const // we store that back into our table: it can't hurt to correct our table. m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm; - // if the current biggest column is smaller than epsilon times the initial biggest column, - // terminate to avoid generating nan/inf values. - // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so) - // repetitions of the unit test, with the result of solve() filled with large values of the order - // of 1/(size*epsilon). - if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) - { + // Track the number of meaningful pivots but do not stop the decomposition to make + // sure that the initial matrix is properly reproduced. See bug 941. + if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) m_nonzero_pivots = k; - m_hCoeffs.tail(size-k).setZero(); - m_qr.bottomRightCorner(rows-k,cols-k) - .template triangularView<StrictlyLower>() - .setZero(); - break; - } // apply the transposition to the columns m_colsTranspositions.coeffRef(k) = biggest_col_index; @@ -519,7 +509,7 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const } m_colsPermutation.setIdentity(PermIndexType(cols)); - for(PermIndexType k = 0; k < m_nonzero_pivots; ++k) + for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k) m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k))); m_det_pq = (number_of_transpositions%2) ? -1 : 1; @@ -575,13 +565,15 @@ struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, interna } // end namespace internal -/** \returns the matrix Q as a sequence of householder transformations */ +/** \returns the matrix Q as a sequence of householder transformations. 
+ * You can extract the meaningful part only by using: + * \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode */ template<typename MatrixType> typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType> ::householderQ() const { eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots); + return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()); } #ifndef __CUDACC__ diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 5fd18b787..58efa6024 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -68,13 +68,13 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -99,10 +99,25 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > if(m_isInitialized) SPQR_free(); MatrixType mat(matrix); + + /* Compute the default threshold as in MATLAB, see: * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 + */ + RealScalar pivotThreshold = m_tolerance; + if(m_useDefaultThreshold) + { + RealScalar max2Norm = 0.0; + for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm()); + if(max2Norm==RealScalar(0)) + max2Norm = RealScalar(1); + pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon(); + } + cholmod_sparse A; A = viewAsCholmod(mat); Index col = matrix.cols(); - m_rank = SuiteSparseQR<Scalar>(m_ordering, m_tolerance, col, &A, + m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc); if (!m_cR) @@ -118,7 +133,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > /** * Get the number of rows of the input matrix and the Q matrix */ - inline Index rows() const {return m_H->nrow; } + inline Index rows() const {return m_cR->nrow; } /** * Get the number of columns of the input matrix.
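The default threshold computed in the hunk above follows the cited rule: tol = 20 * (m + n) * eps * max_j ||A_j||_2, where max_j ||A_j||_2 is the largest column 2-norm, clamped to 1 when the matrix is identically zero. A standalone sketch of the same computation (a hypothetical helper, double precision only, not part of Eigen):

#include <Eigen/SparseCore>
#include <algorithm>
#include <limits>

// SPQR-style default pivot threshold for an m-by-n sparse matrix.
double defaultPivotThreshold(const Eigen::SparseMatrix<double>& mat)
{
  double max2Norm = 0.0;
  for (int j = 0; j < mat.cols(); ++j)              // largest column 2-norm
    max2Norm = std::max(max2Norm, mat.col(j).norm());
  if (max2Norm == 0.0)
    max2Norm = 1.0;                                 // avoid a zero threshold
  return 20.0 * (mat.rows() + mat.cols()) * max2Norm
              * std::numeric_limits<double>::epsilon();
}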
@@ -130,16 +145,25 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > { eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()"); eigen_assert(b.cols()==1 && "This method is for vectors only"); - + //Compute Q^T * b - typename Dest::PlainObject y; + typename Dest::PlainObject y, y2; y = matrixQ().transpose() * b; - // Solves with the triangular matrix R + + // Solves with the triangular matrix R Index rk = this->rank(); - y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y.topRows(rk)); - y.bottomRows(cols()-rk).setZero(); + y2 = y; + y.resize((std::max)(cols(),Index(y.rows())),y.cols()); + y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk)); + // Apply the column permutation - dest.topRows(cols()) = colsPermutation() * y.topRows(cols()); + // colsPermutation() performs a copy of the permutation, + // so let's apply it manually: + for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i); + for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero(); + +// y.bottomRows(y.rows()-rk).setZero(); +// dest = colsPermutation() * y.topRows(cols()); m_info = Success; } @@ -178,7 +202,11 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > /// Set the fill-reducing ordering method to be used void setSPQROrdering(int ord) { m_ordering = ord;} /// Set the tolerance tol to treat columns with 2-norm <= tol as zero - void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; } + void setPivotThreshold(const RealScalar& tol) + { + m_useDefaultThreshold = false; + m_tolerance = tol; + } /** \returns a pointer to the SPQR workspace */ cholmod_common *cholmodCommon() const { return &m_cc; } @@ -210,6 +238,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> > mutable cholmod_dense *m_HTau; // The Householder coefficients mutable StorageIndex m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters + bool m_useDefaultThreshold; // Use default threshold template<typename ,typename > friend struct SPQR_QProduct; }; diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 0f7e5b8fe..444187ae7 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -628,6 +628,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols; internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows; + MatrixType m_scaledMatrix; }; template<typename MatrixType, int QRPreconditioner> @@ -674,8 +675,9 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, u : 0); m_workMatrix.resize(m_diagSize, m_diagSize); - if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); - if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); + if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_rows!=m_cols) m_scaledMatrix.resize(rows,cols); } template<typename MatrixType, int QRPreconditioner> @@ -698,7 +700,13 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig /*** step 1.
The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ - if(!m_qr_precond_morecols.run(*this, matrix/scale) && !m_qr_precond_morerows.run(*this, matrix/scale)) + if(m_rows!=m_cols) + { + m_scaledMatrix = matrix / scale; + m_qr_precond_morecols.run(*this, m_scaledMatrix); + m_qr_precond_morerows.run(*this, m_scaledMatrix); + } + else { m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale; if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows); diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index b148d6b1f..c47077e50 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -17,6 +17,27 @@ enum SimplicialCholeskyMode { SimplicialCholeskyLDLT }; +namespace internal { + template<typename CholMatrixType, typename InputMatrixType> + struct simplicial_cholesky_grab_input { + typedef CholMatrixType const * ConstCholMatrixPtr; + static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp) + { + tmp = input; + pmat = &tmp; + } + }; + + template<typename MatrixType> + struct simplicial_cholesky_grab_input<MatrixType,MatrixType> { + typedef MatrixType const * ConstMatrixPtr; + static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/) + { + pmat = &input; + } + }; +} // end namespace internal + /** \ingroup SparseCholesky_Module * \brief A direct sparse Cholesky factorization * @@ -46,6 +67,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType; + typedef CholMatrixType const * ConstCholMatrixPtr; typedef Matrix<Scalar,Dynamic,1> VectorType; public: @@ -169,10 +191,11 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(matrix.rows()==matrix.cols()); Index size = matrix.cols(); - CholMatrixType ap(size,size); - ordering(matrix, ap); - analyzePattern_preordered(ap, DoLDLT); - factorize_preordered<DoLDLT>(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(matrix, pmat, tmp); + analyzePattern_preordered(*pmat, DoLDLT); + factorize_preordered<DoLDLT>(*pmat); } template<bool DoLDLT> @@ -180,9 +203,21 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); - factorize_preordered<DoLDLT>(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + + if(m_P.size()==0 && (UpLo&Upper)==Upper) + { + // If there is no ordering, try to directly use the input matrix without any copy + internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp); + } + else + { + tmp.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); + pmat = &tmp; + } + + factorize_preordered<DoLDLT>(*pmat); } template<bool DoLDLT> @@ -192,13 +227,14 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived> { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ordering(a, ap); - analyzePattern_preordered(ap,doLDLT); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(a, pmat, tmp); + analyzePattern_preordered(*pmat,doLDLT); } void analyzePattern_preordered(const CholMatrixType& a,
bool doLDLT); - void ordering(const MatrixType& a, CholMatrixType& ap); + void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap); /** keeps off-diagonal entries; drops diagonal entries */ struct keep_diag { @@ -603,26 +639,41 @@ public: }; template<typename Derived> -void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap) +void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap) { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); - // Note that amd compute the inverse permutation + pmat = &ap; + // Note that ordering methods compute the inverse permutation + if(!internal::is_same<OrderingType,NaturalOrdering<Index> >::value) { - CholMatrixType C; - C = a.template selfadjointView<UpLo>(); + { + CholMatrixType C; + C = a.template selfadjointView<UpLo>(); + + OrderingType ordering; + ordering(C,m_Pinv); + } + + if(m_Pinv.size()>0) m_P = m_Pinv.inverse(); + else m_P.resize(0); - OrderingType ordering; - ordering(C,m_Pinv); + ap.resize(size,size); + ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); } - - if(m_Pinv.size()>0) - m_P = m_Pinv.inverse(); else + { + m_Pinv.resize(0); m_P.resize(0); - - ap.resize(size,size); - ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P); + if(UpLo==Lower || MatrixType::IsRowMajor) + { + // we have to transpose the lower part to the upper one + ap.resize(size,size); + ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>(); + } + else + internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, ap); + } } } // end namespace Eigen diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 302323ab4..a93189df2 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -126,7 +126,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType& StorageIndex top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L - for(typename MatrixType::InnerIterator it(ap,k); it; ++it) + for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it) { StorageIndex i = it.index(); if(i <= k) diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 759fc08ff..8bde5d58e 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -73,7 +73,7 @@ class AmbiVector delete[] m_buffer; if (size<1000) { - Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar); + Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar); m_allocatedElements = convert_index((allocSize*sizeof(Scalar))/sizeof(ListEl)); m_buffer = new Scalar[allocSize]; } @@ -92,7 +92,7 @@ class AmbiVector Index copyElements = m_allocatedElements; m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements*1.5),m_size); Index allocSize = m_allocatedElements * sizeof(ListEl); - allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0); + allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index f98b42760..2d4f2bcf8 100644 ---
a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -143,7 +143,7 @@ class CompressedStorage } /** Like at(), but the search is performed in the range [start,end) */ - inline const Scalar& atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const + inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar &defaultValue = Scalar(0)) const { if (start>=end) return defaultValue; diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h index 5e4580329..67718c85b 100644 --- a/Eigen/src/SparseCore/MappedSparseMatrix.h +++ b/Eigen/src/SparseCore/MappedSparseMatrix.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,9 +10,10 @@ #ifndef EIGEN_MAPPED_SPARSEMATRIX_H #define EIGEN_MAPPED_SPARSEMATRIX_H -namespace Eigen { +namespace Eigen { -/** \class MappedSparseMatrix +/** \deprecated Use Map<SparseMatrix<> > + * \class MappedSparseMatrix * * \brief Sparse matrix * @@ -25,179 +26,38 @@ namespace internal { template<typename _Scalar, int _Flags, typename _StorageIndex> struct traits<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > : traits<SparseMatrix<_Scalar, _Flags, _StorageIndex> > {}; -} +} // end namespace internal template<typename _Scalar, int _Flags, typename _StorageIndex> class MappedSparseMatrix - : public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > + : public Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > { - public: - EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix) - enum { IsRowMajor = Base::IsRowMajor }; - - protected: - - StorageIndex m_outerSize; - StorageIndex m_innerSize; - StorageIndex m_nnz; - StorageIndex* m_outerIndex; - StorageIndex* m_innerIndices; - Scalar* m_values; + typedef Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > Base; public: - - inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } - inline StorageIndex cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } - inline StorageIndex innerSize() const { return m_innerSize; } - inline StorageIndex outerSize() const { return m_outerSize; } - bool isCompressed() const { return true; } - - //---------------------------------------- - // direct access interface - inline const Scalar* valuePtr() const { return m_values; } - inline Scalar* valuePtr() { return m_values; } - - inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } - inline StorageIndex* innerIndexPtr() { return m_innerIndices; } - - inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } - inline StorageIndex* outerIndexPtr() { return m_outerIndex; } - //---------------------------------------- - - inline Scalar coeff(Index row, Index col) const - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? 
col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - if (start==end) - return Scalar(0); - else if (end>0 && inner==m_innerIndices[end-1]) - return m_values[end-1]; - // ^^ optimization: let's first check if it is the last coefficient - // (very common in high level algorithms) + typedef typename Base::StorageIndex StorageIndex; + typedef typename Base::Scalar Scalar; - const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); - const Index id = r-&m_innerIndices[0]; - return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); - } - - inline Scalar& coeffRef(Index row, Index col) - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); - eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); - StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner); - const Index id = r-&m_innerIndices[0]; - eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient"); - return m_values[id]; - } - - class InnerIterator; - class ReverseInnerIterator; - - /** \returns the number of non zero coefficients */ - inline StorageIndex nonZeros() const { return m_nnz; } - - inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr) - : m_outerSize(convert_index(IsRowMajor?rows:cols)), m_innerSize(convert_index(IsRowMajor?cols:rows)), m_nnz(convert_index(nnz)), - m_outerIndex(outerIndexPtr), m_innerIndices(innerIndexPtr), m_values(valuePtr) + inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr) {} /** Empty destructor */ inline ~MappedSparseMatrix() {} }; -template<typename Scalar, int _Flags, typename _StorageIndex> -class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::InnerIterator -{ - public: - InnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(convert_index(outer)), - m_id(mat.outerIndexPtr()[outer]), - m_start(m_id), - m_end(mat.outerIndexPtr()[outer+1]) - {} - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); } - - inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id]; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; - const StorageIndex m_end; -}; - -template<typename Scalar, int _Flags, typename _StorageIndex> -class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer+1]), - m_start(mat.outerIndexPtr()[outer]), - m_end(m_id) - {} - - inline ReverseInnerIterator& operator--() { m_id--; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); } - - inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id-1]; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; - const StorageIndex m_end; -}; - namespace internal { -template<typename _Scalar, int _Options, typename _Index> -struct evaluator<MappedSparseMatrix<_Scalar,_Options,_Index> > - : evaluator_base<MappedSparseMatrix<_Scalar,_Options,_Index> > +template<typename _Scalar, int _Options, typename _StorageIndex> +struct evaluator<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > + : evaluator<SparseCompressedBase<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > > { - typedef MappedSparseMatrix<_Scalar,_Options,_Index> MappedSparseMatrixType; - typedef typename MappedSparseMatrixType::InnerIterator InnerIterator; - typedef typename MappedSparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = MappedSparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const MappedSparseMatrixType &mat) : m_matrix(&mat) {} - - operator MappedSparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const MappedSparseMatrixType&() const { return *m_matrix; } + typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType; + typedef evaluator<SparseCompressedBase<XprType> > Base; - const MappedSparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 8db4bbb75..dfdf4314a 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -74,7 +74,7 @@ namespace internal { template<typename SparseMatrixType, int BlockRows, int BlockCols>
class sparse_matrix_block_impl
- : public SparseMatrixBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
+ : public SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
{
typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
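With this change, inner-panel sparse blocks derive from SparseCompressedBase rather than SparseMatrixBase, so a panel of whole columns (or whole rows, in row-major mode) inherits the generic InnerIterator and raw-pointer interface defined in the new SparseCompressedBase.h further down. A minimal usage sketch — not part of the patch, assuming a compressed column-major matrix and that the Block typedef below matches what middleCols() returns:

    #include <Eigen/SparseCore>
    #include <iostream>

    int main()
    {
      Eigen::SparseMatrix<double> A(4,4);   // column-major (CSC) by default
      A.insert(0,1) = 1.0;
      A.insert(2,1) = 2.0;
      A.insert(3,2) = 3.0;
      A.makeCompressed();

      // A set of whole columns of a column-major matrix is an "inner panel"
      // block; with this patch it can be traversed with the InnerIterator
      // provided by SparseCompressedBase.
      typedef Eigen::Block<Eigen::SparseMatrix<double>,
                           Eigen::Dynamic, Eigen::Dynamic, true> Panel;
      Panel P = A.middleCols(1,2);
      for (int k = 0; k < P.outerSize(); ++k)
        for (Panel::InnerIterator it(P,k); it; ++it)
          std::cout << it.row() << "," << it.col() << " -> " << it.value() << "\n";
      return 0;
    }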
@@ -173,19 +173,24 @@ public:
    }
inline const Scalar* valuePtr() const
- { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.valuePtr(); }
inline Scalar* valuePtr()
- { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.const_cast_derived().valuePtr(); }
inline const StorageIndex* innerIndexPtr() const
- { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.innerIndexPtr(); }
inline StorageIndex* innerIndexPtr()
- { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+ { return m_matrix.const_cast_derived().innerIndexPtr(); }
inline const StorageIndex* outerIndexPtr() const
{ return m_matrix.outerIndexPtr() + m_outerStart; }
inline StorageIndex* outerIndexPtr()
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
+
+ inline const StorageIndex* innerNonZeroPtr() const
+ { return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
+ inline StorageIndex* innerNonZeroPtr()
+ { return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
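Note the convention change above: a block's valuePtr() and innerIndexPtr() are no longer pre-offset by outerIndexPtr()[m_outerStart]; only outerIndexPtr() is shifted, so nonzero positions must always be resolved through the outer index array. A hedged sketch of that addressing rule — dump_panel is a hypothetical helper, and compressed storage (innerNonZeroPtr()==0) is assumed; one would call, e.g., dump_panel(A.middleCols(1,2)):

    #include <Eigen/SparseCore>
    #include <iostream>

    // Hypothetical helper illustrating the addressing convention after this
    // change: the value/inner arrays are the underlying matrix's own arrays,
    // and only outerIndexPtr() is shifted by the block's starting outer index.
    template<typename SparseBlock>
    void dump_panel(const SparseBlock& B)
    {
      for (int k = 0; k < B.outerSize(); ++k)
        for (int p = B.outerIndexPtr()[k]; p < B.outerIndexPtr()[k+1]; ++p)
          std::cout << B.innerIndexPtr()[p] << " -> " << B.valuePtr()[p] << "\n";
    }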
StorageIndex nonZeros() const
{
@@ -197,6 +202,8 @@ public:
      else
return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
}
+
+ bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
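isCompressed() simply tests whether the underlying matrix carries an innerNonZeros array, mirroring the definition that SparseCompressedBase introduces below. For reference, this is how the two storage modes typically show up in user code (a short sketch relying on standard SparseMatrix behavior):

    #include <Eigen/SparseCore>
    #include <cassert>

    int main()
    {
      Eigen::SparseMatrix<double> A(3,3);
      A.reserve(Eigen::VectorXi::Constant(3,2)); // per-column slack -> uncompressed mode
      A.insert(0,0) = 1.0;
      A.insert(2,1) = 2.0;
      assert(!A.isCompressed());   // innerNonZeroPtr() is non-null here
      A.makeCompressed();          // squeeze out the slack
      assert(A.isCompressed());    // innerNonZeroPtr() is null again
      return 0;
    }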
const Scalar& lastCoeff() const
{
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h new file mode 100644 index 000000000..a5ba45e04 --- /dev/null +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H +#define EIGEN_SPARSE_COMPRESSED_BASE_H + +namespace Eigen { + +template<typename Derived> class SparseCompressedBase; + +namespace internal { + +template<typename Derived> +struct traits<SparseCompressedBase<Derived> > : traits<Derived> +{}; + +} // end namespace internal + +template<typename Derived> +class SparseCompressedBase + : public SparseMatrixBase<Derived> +{ + public: + typedef SparseMatrixBase<Derived> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase) + using Base::operator=; + using Base::IsRowMajor; + + class InnerIterator; + class ReverseInnerIterator; + + /** \returns a const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline const Scalar* valuePtr() const { return derived().valuePtr(); } + /** \returns a non-const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline Scalar* valuePtr() { return derived().valuePtr(); } + + /** \returns a const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); } + /** \returns a non-const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); } + + /** \returns a const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); } + /** \returns a non-const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); } + + /** \returns a const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); } + /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); } + + /** \returns whether \c *this is in compressed form. 
*/ + inline bool isCompressed() const { return innerNonZeroPtr()==0; } + +}; + +template<typename Derived> +class SparseCompressedBase<Derived>::InnerIterator +{ + public: + InnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_end = mat.outerIndexPtr()[outer+1]; + else + m_end = m_id + mat.innerNonZeroPtr()[outer]; + } + + inline InnerIterator& operator++() { m_id++; return *this; } + + inline const Scalar& value() const { return m_values[m_id]; } + inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); } + + inline StorageIndex index() const { return m_indices[m_id]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id < m_end); } + + protected: + const Scalar* m_values; + const StorageIndex* m_indices; + const Index m_outer; + Index m_id; + Index m_end; + private: + // If you get here, then you're not using the right InnerIterator type, e.g.: + // SparseMatrix<double,RowMajor> A; + // SparseMatrix<double>::InnerIterator it(A,0); + template<typename T> InnerIterator(const SparseMatrixBase<T>&, Index outer); +}; + +template<typename Derived> +class SparseCompressedBase<Derived>::ReverseInnerIterator +{ + public: + ReverseInnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_id = mat.outerIndexPtr()[outer+1]; + else + m_id = m_start + mat.innerNonZeroPtr()[outer]; + } + + inline ReverseInnerIterator& operator--() { --m_id; return *this; } + + inline const Scalar& value() const { return m_values[m_id-1]; } + inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); } + + inline StorageIndex index() const { return m_indices[m_id-1]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id > m_start); } + + protected: + const Scalar* m_values; + const StorageIndex* m_indices; + const Index m_outer; + Index m_id; + const Index m_start; +}; + +namespace internal { + +template<typename Derived> +struct evaluator<SparseCompressedBase<Derived> > + : evaluator_base<Derived> +{ + typedef typename Derived::Scalar Scalar; + typedef typename Derived::InnerIterator InnerIterator; + typedef typename Derived::ReverseInnerIterator ReverseInnerIterator; + + enum { + CoeffReadCost = NumTraits<Scalar>::ReadCost, + Flags = Derived::Flags + }; + + evaluator() : m_matrix(0) {} + explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + + operator Derived&() { return m_matrix->const_cast_derived(); } + operator const Derived&() const { return *m_matrix; } + + typedef typename DenseCoeffsBase<Derived,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType; + Scalar coeff(Index row, Index col) const + { return m_matrix->coeff(row,col); } + + Scalar& coeffRef(Index row, Index col) + { + eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? 
col : row; + + Index start = m_matrix->outerIndexPtr()[outer]; + Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; + eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) + - m_matrix->innerIndexPtr(); + eigen_assert((p<end) && (m_matrix->innerIndexPtr()[p]==inner) && "written coefficient does not exist"); + return m_matrix->const_cast_derived().valuePtr()[p]; + } + + const Derived *m_matrix; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_COMPRESSED_BASE_H diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h new file mode 100644 index 000000000..a6ff7d559 --- /dev/null +++ b/Eigen/src/SparseCore/SparseMap.h @@ -0,0 +1,239 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_MAP_H +#define EIGEN_SPARSE_MAP_H + +namespace Eigen { + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct traits<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef traits<PlainObjectType> TraitsBase; + enum { + Flags = TraitsBase::Flags & (~NestByRefBit) + }; +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef traits<PlainObjectType> TraitsBase; + enum { + Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit)) + }; +}; + +} // end namespace internal + +template<typename Derived, + int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors +> class SparseMapBase; + +template<typename Derived> +class SparseMapBase<Derived,ReadOnlyAccessors> + : public SparseCompressedBase<Derived> +{ + public: + typedef SparseCompressedBase<Derived> Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + enum { IsRowMajor = Base::IsRowMajor }; + using Base::operator=; + protected: + + typedef typename internal::conditional< + bool(internal::is_lvalue<Derived>::value), + Scalar *, const Scalar *>::type ScalarPointer; + typedef typename internal::conditional< + bool(internal::is_lvalue<Derived>::value), + StorageIndex *, const StorageIndex *>::type IndexPointer; + + Index m_outerSize; + Index m_innerSize; + Index m_nnz; + IndexPointer m_outerIndex; + IndexPointer m_innerIndices; + ScalarPointer m_values; + IndexPointer m_innerNonZeros; + + public: + + inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + inline Index cols() const { return IsRowMajor ? 
m_innerSize : m_outerSize; } + inline Index innerSize() const { return m_innerSize; } + inline Index outerSize() const { return m_outerSize; } + + bool isCompressed() const { return m_innerNonZeros==0; } + + //---------------------------------------- + // direct access interface + inline const Scalar* valuePtr() const { return m_values; } + inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; } + inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; } + inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; } + //---------------------------------------- + + inline Scalar coeff(Index row, Index col) const + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = m_outerIndex[outer]; + Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer]; + if (start==end) + return Scalar(0); + else if (end>0 && inner==m_innerIndices[end-1]) + return m_values[end-1]; + // ^^ optimization: let's first check if it is the last coefficient + // (very common in high level algorithms) + + const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); + const Index id = r-&m_innerIndices[0]; + return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); + } + + /** \returns the number of non zero coefficients */ + inline Index nonZeros() const { return m_nnz; } + + inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr, + ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0) + : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr), + m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template<typename Derived> +class SparseMapBase<Derived,WriteAccessors> + : public SparseMapBase<Derived,ReadOnlyAccessors> +{ + typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase; + + public: + typedef SparseMapBase<Derived, ReadOnlyAccessors> Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + enum { IsRowMajor = Base::IsRowMajor }; + + using Base::operator=; + + public: + + //---------------------------------------- + // direct access interface + using Base::valuePtr; + using Base::innerIndexPtr; + using Base::outerIndexPtr; + using Base::innerNonZeroPtr; + inline Scalar* valuePtr() { return Base::m_values; } + inline StorageIndex* innerIndexPtr() { return Base::m_innerIndices; } + inline StorageIndex* outerIndexPtr() { return Base::m_outerIndex; } + inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; } + //---------------------------------------- + + inline Scalar& coeffRef(Index row, Index col) + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = Base::m_outerIndex[outer]; + Index end = Base::isCompressed() ? 
Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer]; + eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); + eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); + Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner); + const Index id = r - &Base::m_innerIndices[0]; + eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient"); + return const_cast<Scalar*>(Base::m_values)[id]; + } + + inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, + Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + public: + typedef SparseMapBase<Map> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, + StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public SparseMapBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + public: + typedef SparseMapBase<Map> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr, + const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : 
Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_MAP_H diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index dd11fcafd..c049f9bcb 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -51,7 +51,7 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> > ColsAtCompileTime = Dynamic, MaxRowsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic, - Flags = _Options | NestByRefBit | LvalueBit, + Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; @@ -90,16 +90,20 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> template<typename _Scalar, int _Options, typename _Index> class SparseMatrix - : public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> > + : public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _Index> > { public: - EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) + typedef SparseCompressedBase<SparseMatrix> Base; + using Base::isCompressed; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) typedef MappedSparseMatrix<Scalar,Flags> Map; typedef Diagonal<SparseMatrix> DiagonalReturnType; typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType; + typedef typename Base::InnerIterator InnerIterator; + typedef typename Base::ReverseInnerIterator ReverseInnerIterator; using Base::IsRowMajor; @@ -124,9 +128,6 @@ class SparseMatrix public: - /** \returns whether \c *this is in compressed form. */ - inline bool isCompressed() const { return m_innerNonZeros==0; } - /** \returns the number of rows of the matrix */ inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } /** \returns the number of columns of the matrix */ @@ -180,7 +181,7 @@ class SparseMatrix /** \returns the value of the matrix at position \a i, \a j * This function returns Scalar(0) if the element is an explicit \em zero */ - inline const Scalar& coeff(Index row, Index col) const + inline Scalar coeff(Index row, Index col) const { eigen_assert(row>=0 && row<rows() && col>=0 && col<cols()); @@ -242,9 +243,6 @@ class SparseMatrix public: - class InnerIterator; - class ReverseInnerIterator; - /** Removes all non zeros but keep allocated memory */ inline void setZero() { @@ -874,77 +872,6 @@ private: }; }; -template<typename Scalar, int _Options, typename _Index> -class SparseMatrix<Scalar,_Options,_Index>::InnerIterator -{ - public: - InnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(convert_index(outer)), m_id(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_end = mat.m_outerIndex[outer+1]; - else - m_end = m_id + mat.m_innerNonZeros[outer]; - } - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline const Scalar& value() const { return m_values[m_id]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); } - - inline StorageIndex index() const { return m_indices[m_id]; } - inline StorageIndex outer() const { return m_outer; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end); } - - protected: - const Scalar* m_values; - const StorageIndex* m_indices; - const StorageIndex m_outer; - StorageIndex m_id; - StorageIndex m_end; - private: - // If you get here, then you're not using the right InnerIterator type, e.g.: - // SparseMatrix<double,RowMajor> A; - // SparseMatrix<double>::InnerIterator it(A,0); - template<typename T> InnerIterator(const SparseMatrixBase<T>&,Index outer); -}; - -template<typename Scalar, int _Options, typename _Index> -class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_id = mat.m_outerIndex[outer+1]; - else - m_id = m_start + mat.m_innerNonZeros[outer]; - } - - inline ReverseInnerIterator& operator--() { --m_id; return *this; } - - inline const Scalar& value() const { return m_values[m_id-1]; } - inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); } - - inline StorageIndex index() const { return m_indices[m_id-1]; } - inline StorageIndex outer() const { return m_outer; } - inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); } - inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id > m_start); } - - protected: - const Scalar* m_values; - const StorageIndex* m_indices; - const StorageIndex m_outer; - StorageIndex m_id; - const StorageIndex m_start; -}; - namespace internal { template<typename InputIterator, typename SparseMatrixType> @@ -1074,6 +1001,10 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit); if (needToTranspose) { @@ -1276,44 +1207,12 @@ namespace internal { template<typename _Scalar, int _Options, typename _Index> struct evaluator<SparseMatrix<_Scalar,_Options,_Index> > - : evaluator_base<SparseMatrix<_Scalar,_Options,_Index> > + : evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > { - typedef _Scalar Scalar; + typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > Base; typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType; - typedef typename SparseMatrixType::InnerIterator InnerIterator; - typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = SparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {} - - operator SparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const SparseMatrixType&() const { return *m_matrix; } - - typedef typename DenseCoeffsBase<SparseMatrixType,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType; - CoeffReturnType coeff(Index row, Index col) const - { return m_matrix->coeff(row,col); } - - Scalar& coeffRef(Index row, Index col) - { - eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && 
col<m_matrix->cols()); - - const Index outer = SparseMatrixType::IsRowMajor ? row : col; - const Index inner = SparseMatrixType::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = m_matrix->data().searchLowerIndex(start,end-1,inner); - eigen_assert((p<end) && (m_matrix->data().index(p)==inner) && "written coefficient does not exist"); - return m_matrix->const_cast_derived().data().value(p); - } - - const SparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const SparseMatrixType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h new file mode 100644 index 000000000..2ca039323 --- /dev/null +++ b/Eigen/src/SparseCore/SparseRef.h @@ -0,0 +1,192 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_REF_H +#define EIGEN_SPARSE_REF_H + +namespace Eigen { + +namespace internal { + +template<typename Derived> class SparseRefBase; + +template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType> +struct traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > + : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + enum { + Options = _Options, + Flags = traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit + }; + + template<typename Derived> struct match { + enum { + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch + }; + typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type; + }; + +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType> +struct traits<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > + : public traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> > +{ + enum { + Flags = (traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + }; +}; + +template<typename Derived> +struct traits<SparseRefBase<Derived> > : public traits<Derived> {}; + +template<typename Derived> class SparseRefBase + : public SparseMapBase<Derived> +{ +public: + + typedef SparseMapBase<Derived> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase) + + SparseRefBase() + : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) + {} + +protected: + + + template<typename Expression> + void construct(Expression& expr) + { + ::new (static_cast<Base*>(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), 
expr.valuePtr(), expr.innerNonZeroPtr()); + } +}; + +} // namespace internal + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > + : public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; + typedef internal::traits<Ref> Traits; + template<int OtherOptions> + inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr); + template<int OtherOptions> + inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr); + public: + + typedef internal::SparseRefBase<Ref> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + + #ifndef EIGEN_PARSED_BY_DOXYGEN + template<int OtherOptions> + inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template<int OtherOptions> + inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template<typename Derived> + inline Ref(const SparseCompressedBase<Derived>& expr) + #else + template<typename Derived> + inline Ref(SparseCompressedBase<Derived>& expr) + #endif + { + EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); + EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.const_cast_derived()); + } +}; + +// this is the const ref version +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> + : public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > +{ + typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType; + typedef internal::traits<Ref> Traits; + public: + + typedef internal::SparseRefBase<Ref> Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + template<typename Derived> + inline Ref(const SparseMatrixBase<Derived>& expr) + { + construct(expr.derived(), typename Traits::template match<Derived>::type()); + } + + inline Ref(const Ref& other) : Base(other) { + // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy + } + + template<typename OtherRef> + inline Ref(const RefBase<OtherRef>& other) { + construct(other.derived(), typename Traits::template match<OtherRef>::type()); + } + + protected: + + template<typename Expression> + void construct(const Expression& expr,internal::true_type) + { + Base::construct(expr); + } + + template<typename Expression> + void construct(const Expression& expr, internal::false_type) + { + m_object = expr; + Base::construct(m_object); + } + + protected: + TPlainObjectType m_object; +}; + + +namespace internal { + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, 
StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType> +struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > + : evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > +{ + typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base; + typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_REF_H diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index c74af46b3..84413c374 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -12,13 +12,41 @@ namespace Eigen { +namespace internal { + template<typename MatrixType,int CompressedAccess=int(MatrixType::Flags&CompressedAccessBit)> + class SparseTransposeImpl + : public SparseMatrixBase<Transpose<MatrixType> > + {}; + + template<typename MatrixType> + class SparseTransposeImpl<MatrixType,CompressedAccessBit> + : public SparseCompressedBase<Transpose<MatrixType> > + { + typedef SparseCompressedBase<Transpose<MatrixType> > Base; + public: + using Base::derived; + typedef typename Base::Scalar Scalar; + typedef typename Base::StorageIndex StorageIndex; + + inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); } + inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); } + inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); } + inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); } + + inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); } + inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); } + inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); } + inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); } + }; +} + // Implement nonZeros() for transpose. I'm not sure that's the best approach for that. // Perhaps it should be implemented in Transpose<> itself. 
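The CompressedAccessBit specialization above forwards all four storage arrays of the nested expression unchanged, so transposing a compressed sparse matrix is a pure reinterpretation: the CSC arrays of A serve directly as the CSR arrays of A^T. A sketch of the resulting aliasing — not part of the patch, assuming the forwarding accessors are reachable as written:

    #include <Eigen/SparseCore>
    #include <cassert>

    int main()
    {
      Eigen::SparseMatrix<double> A(3,4);   // CSC storage
      A.insert(1,2) = 5.0;
      A.makeCompressed();

      // The transpose reuses A's buffers verbatim: no copy takes place.
      assert(A.transpose().valuePtr()      == A.valuePtr());
      assert(A.transpose().innerIndexPtr() == A.innerIndexPtr());
      return 0;
    }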
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse> - : public SparseMatrixBase<Transpose<MatrixType> > + : public internal::SparseTransposeImpl<MatrixType> { protected: - typedef SparseMatrixBase<Transpose<MatrixType> > Base; + typedef internal::SparseTransposeImpl<MatrixType> Base; public: inline typename MatrixType::StorageIndex nonZeros() const { return Base::derived().nestedExpression().nonZeros(); } }; diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 5714150c2..ac2b4f10b 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -44,8 +44,7 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) // TODO this is mostly the same as EIGEN_GENERIC_PUBLIC_INTERFACE -#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \ - typedef BaseClass Base; \ +#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \ typedef typename Eigen::internal::nested<Derived >::type Nested; \ @@ -61,7 +60,8 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) using Base::convert_index; #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ - _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >) + typedef Eigen::SparseMatrixBase<Derived > Base; \ + _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) const int CoherentAccessPattern = 0x1; const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern; diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 0c48fef3e..380ba25c0 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -309,7 +309,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >, // Functions void initperfvalues() { - m_perfv.panel_size = 1; + m_perfv.panel_size = 16; m_perfv.relax = 1; m_perfv.maxsuper = 128; m_perfv.rowblk = 16; diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index f00bc3976..3a9b5fd74 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -627,8 +627,12 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> m_sluFerr.resize(rhsCols); m_sluBerr.resize(rhsCols); - m_sluB = SluMatrix::Map(b.const_cast_derived()); - m_sluX = SluMatrix::Map(x.derived()); + + Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b); + Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -651,6 +655,10 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &m_sluFerr[0], &m_sluBerr[0], &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); + + if(&x.coeffRef(0) != x_ref.data()) + x = x_ref; + m_info = info==0 ? 
Success : NumericalIssue;
 }
 
@@ -938,8 +946,12 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
 
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
+
+  Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
+
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
 
   typename Rhs::PlainObject b_cpy;
   if(m_sluEqued!='N')
@@ -962,6 +974,9 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
               &recip_pivot_growth, &rcond,
               &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
+
+  if(&x.coeffRef(0) != x_ref.data())
+    x = x_ref;
 
   m_info = info==0 ? Success : NumericalIssue;
 }
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 982aa2fca..47e8b6304 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -403,11 +403,22 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
   eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
 
   int errorCode;
+  Scalar* x_ptr = 0;
+  Matrix<Scalar,Dynamic,1> x_tmp;
+  if(x.innerStride()!=1)
+  {
+    x_tmp.resize(x.rows());
+    x_ptr = x_tmp.data();
+  }
   for (int j=0; j<rhsCols; ++j)
   {
+    if(x.innerStride()==1)
+      x_ptr = &x.col(j).coeffRef(0);
     errorCode = umfpack_solve(UMFPACK_A,
         m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-        &x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
+        x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
+    if(x.innerStride()!=1)
+      x.col(j) = x_tmp;
     if (errorCode!=0)
       return false;
   }
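Both SuperLU paths above funnel b and x through Ref<const Matrix<...,ColMajor> >, which binds in place when the argument is already a contiguous column-major matrix and otherwise evaluates it into an internal temporary that is copied back after the solve (the "if(&x.coeffRef(0) != x_ref.data())" test). A standalone sketch of that binding behavior, illustrative only, using the dense Ref that ships with Eigen:

    #include <Eigen/Dense>
    #include <iostream>

    // Prints whether the Ref aliases the caller's storage. Ref<const MatrixXd>
    // binds without copying when given a plain column-major matrix, and
    // evaluates into an internal temporary when given an expression.
    void inspect(const Eigen::Ref<const Eigen::MatrixXd>& r, const double* original)
    {
      std::cout << (r.data() == original ? "aliases" : "copied") << "\n";
    }

    int main()
    {
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(4,2);
      inspect(B, B.data());        // "aliases": no temporary is created
      inspect(3.0*B, B.data());    // "copied": the expression is evaluated first
      return 0;
    }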