diff options
author | 2008-08-09 18:41:24 +0000 | |
---|---|---|
committer | 2008-08-09 18:41:24 +0000 | |
commit | 4fa40367e9bf55ea8b2ad1040b3fb73f94e4f12f (patch) | |
tree | 3ca6d7cff691daf2d6bc8d6b1ecb00971f9debf3 | |
parent | becbeda50ac17288dba0a93c6adc67b663d32a7a (diff) |
* Big change in Block and Map:
- added a MapBase base expression on top of which Map and the specialization
of Block are implemented
- MapBase forces both aligned loads (and aligned stores, see below) in expressions
such as "x.block(...) += other_expr"
* Significant vectorization improvement:
- added an AlignedBit flag meaning the first coeff/packet is aligned,
this avoids generating extra code to deal with the first unaligned part
- removed all unaligned stores when no unrolling
- removed unaligned loads in Sum when the input has the DirectAccessBit flag
* Some code simplification in CacheFriendly product
* Some minor documentation improvements
-rw-r--r-- | Eigen/Core | 3 | ||||
-rw-r--r-- | Eigen/src/Core/Assign.h | 59 | ||||
-rw-r--r-- | Eigen/src/Core/Block.h | 145 | ||||
-rw-r--r-- | Eigen/src/Core/CacheFriendlyProduct.h | 42 | ||||
-rw-r--r-- | Eigen/src/Core/Coeffs.h | 12 | ||||
-rw-r--r-- | Eigen/src/Core/CwiseBinaryOp.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/CwiseUnaryOp.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/Dot.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/Map.h | 117 | ||||
-rw-r--r-- | Eigen/src/Core/MapBase.h | 167 | ||||
-rw-r--r-- | Eigen/src/Core/MatrixBase.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/Sum.h | 36 | ||||
-rw-r--r-- | Eigen/src/Core/Swap.h | 30 | ||||
-rw-r--r-- | Eigen/src/Core/util/Constants.h | 27 | ||||
-rw-r--r-- | Eigen/src/Core/util/ForwardDeclarations.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/util/Meta.h | 6 | ||||
-rw-r--r-- | Eigen/src/Geometry/Quaternion.h | 9 |
17 files changed, 392 insertions, 291 deletions
diff --git a/Eigen/Core b/Eigen/Core index e671f7879..db2fde30d 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -45,6 +45,8 @@ namespace Eigen { #include "src/Core/Product.h" #include "src/Core/DiagonalProduct.h" #include "src/Core/InverseProduct.h" +#include "src/Core/MapBase.h" +#include "src/Core/Map.h" #include "src/Core/Block.h" #include "src/Core/Minor.h" #include "src/Core/Transpose.h" @@ -54,7 +56,6 @@ namespace Eigen { #include "src/Core/Redux.h" #include "src/Core/Visitor.h" #include "src/Core/Fuzzy.h" -#include "src/Core/Map.h" #include "src/Core/IO.h" #include "src/Core/Swap.h" #include "src/Core/CommaInitializer.h" diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index d744a15a4..5ea59e3db 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -34,6 +34,13 @@ template <typename Derived, typename OtherDerived> struct ei_assign_traits { +public: + enum { + DstIsAligned = Derived::Flags & AlignedBit, + SrcIsAligned = OtherDerived::Flags & AlignedBit, + SrcAlignment = DstIsAligned && SrcIsAligned ? 
Aligned : Unaligned + }; + private: enum { InnerSize = int(Derived::Flags)&RowMajorBit @@ -48,7 +55,8 @@ private: enum { MightVectorize = (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit) && ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)), - MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0, + MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0 + && int(DstIsAligned) && int(SrcIsAligned), MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit), MaySliceVectorize = MightVectorize && int(InnerMaxSize)==Dynamic /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case @@ -79,7 +87,7 @@ public: : int(NoUnrolling) ) : int(Vectorization) == int(LinearVectorization) - ? ( int(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) ) + ? ( int(MayUnrollCompletely) && int(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) ) : int(NoUnrolling) }; }; @@ -154,7 +162,7 @@ struct ei_assign_innervec_CompleteUnrolling inline static void run(Derived1 &dst, const Derived2 &src) { - dst.template copyPacket<Derived2, Aligned>(row, col, src); + dst.template copyPacket<Derived2, Aligned, Aligned>(row, col, src); ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src); } @@ -173,7 +181,7 @@ struct ei_assign_innervec_InnerUnrolling { const int row = int(Derived1::Flags)&RowMajorBit ? row_or_col : Index; const int col = int(Derived1::Flags)&RowMajorBit ? 
Index : row_or_col; - dst.template copyPacket<Derived2, Aligned>(row, col, src); + dst.template copyPacket<Derived2, Aligned, Aligned>(row, col, src); ei_assign_innervec_InnerUnrolling<Derived1, Derived2, Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, row_or_col); } @@ -256,9 +264,9 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling> for(int i = 0; i < innerSize; i+=packetSize) { if(int(Derived1::Flags)&RowMajorBit) - dst.template copyPacket<Derived2, Aligned>(j, i, src); + dst.template copyPacket<Derived2, Aligned, Aligned>(j, i, src); else - dst.template copyPacket<Derived2, Aligned>(i, j, src); + dst.template copyPacket<Derived2, Aligned, Aligned>(i, j, src); } } }; @@ -298,14 +306,19 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling> { const int size = dst.size(); const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size; - const int alignedSize = (size/packetSize)*packetSize; + const int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 
0 + : ei_alignmentOffset(&dst.coeffRef(0), size); + const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize; - for(int index = 0; index < alignedSize; index += packetSize) + for(int index = 0; index < alignedStart; index++) + dst.copyCoeff(index, src); + + for(int index = alignedStart; index < alignedEnd; index += packetSize) { - dst.template copyPacket<Derived2, Aligned>(index, src); + dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::SrcAlignment>(index, src); } - for(int index = alignedSize; index < size; index++) + for(int index = alignedEnd; index < size; index++) dst.copyCoeff(index, src); } }; @@ -334,29 +347,45 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling> static void run(Derived1 &dst, const Derived2 &src) { const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size; + const int packetAlignedMask = packetSize - 1; const int innerSize = dst.innerSize(); const int outerSize = dst.outerSize(); - const int alignedInnerSize = (innerSize/packetSize)*packetSize; + const int alignedStep = (packetSize - dst.stride() % packetSize) & packetAlignedMask; + int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 
0 + : ei_alignmentOffset(&dst.coeffRef(0), innerSize); for(int i = 0; i < outerSize; i++) { + const int alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask); + + // do the non-vectorizable part of the assignment + for (int index = 0; index<alignedStart ; index++) + { + if(Derived1::Flags&RowMajorBit) + dst.copyCoeff(i, index, src); + else + dst.copyCoeff(index, i, src); + } + // do the vectorizable part of the assignment - for (int index = 0; index<alignedInnerSize ; index+=packetSize) + for (int index = alignedStart; index<alignedEnd; index+=packetSize) { if(Derived1::Flags&RowMajorBit) - dst.template copyPacket<Derived2, Unaligned>(i, index, src); + dst.template copyPacket<Derived2, Aligned, Unaligned>(i, index, src); else - dst.template copyPacket<Derived2, Unaligned>(index, i, src); + dst.template copyPacket<Derived2, Aligned, Unaligned>(index, i, src); } // do the non-vectorizable part of the assignment - for (int index = alignedInnerSize; index<innerSize ; index++) + for (int index = alignedEnd; index<innerSize ; index++) { if(Derived1::Flags&RowMajorBit) dst.copyCoeff(i, index, src); else dst.copyCoeff(index, i, src); } + + alignedStart = (alignedStart+alignedStep)%packetSize; } } }; diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index c3174073b..c2c9606a5 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -33,6 +33,8 @@ * \param MatrixType the type of the object in which we are taking a block * \param BlockRows the number of rows of the block we are taking at compile time (optional) * \param BlockCols the number of columns of the block we are taking at compile time (optional) + * \param _PacketAccess + * \param _DirectAccessStatus \internal used for partial specialization * * This class represents an expression of either a fixed-size or dynamic-size block. 
It is the return * type of MatrixBase::block(int,int,int,int) and MatrixBase::block<int,int>(int,int) and @@ -56,8 +58,8 @@ * * \sa MatrixBase::block(int,int,int,int), MatrixBase::block(int,int), class VectorBlock */ -template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus> -struct ei_traits<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> > +template<typename MatrixType, int BlockRows, int BlockCols, int _PacketAccess, int _DirectAccessStatus> +struct ei_traits<Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectAccessStatus> > { typedef typename MatrixType::Scalar Scalar; enum{ @@ -74,17 +76,21 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> > RowMajor = int(MatrixType::Flags)&RowMajorBit, InnerSize = RowMajor ? ColsAtCompileTime : RowsAtCompileTime, InnerMaxSize = RowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime, - MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize % ei_packet_traits<Scalar>::size) == 0) + MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize >= ei_packet_traits<Scalar>::size)) ? PacketAccessBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? 
LinearAccessBit : 0, Flags = (MatrixType::Flags & (HereditaryBits | MaskPacketAccessBit | DirectAccessBit) & MaskLargeBit) | FlagsLinearAccessBit, - CoeffReadCost = MatrixType::CoeffReadCost + CoeffReadCost = MatrixType::CoeffReadCost, + PacketAccess = _PacketAccess }; + typedef typename ei_meta_if<int(PacketAccess)==Aligned, + Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectAccessStatus>&, + Block<MatrixType, BlockRows, BlockCols, Aligned, _DirectAccessStatus> >::ret AlignedDerivedType; }; -template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus> class Block - : public MatrixBase<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> > +template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess, int _DirectAccessStatus> class Block + : public MatrixBase<Block<MatrixType, BlockRows, BlockCols, PacketAccess, _DirectAccessStatus> > { public: @@ -205,26 +211,36 @@ template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatu }; /** \internal */ -template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixType,BlockRows,BlockCols,HasDirectAccess> - : public MatrixBase<Block<MatrixType, BlockRows, BlockCols,HasDirectAccess> > +template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess> +class Block<MatrixType,BlockRows,BlockCols,PacketAccess,HasDirectAccess> + : public MapBase<Block<MatrixType, BlockRows, BlockCols,PacketAccess,HasDirectAccess> > { - enum { - IsRowMajor = int(ei_traits<MatrixType>::Flags)&RowMajorBit ? 
1 : 0 - }; - public: - EIGEN_GENERIC_PUBLIC_INTERFACE(Block) + _EIGEN_GENERIC_PUBLIC_INTERFACE(Block, MapBase<Block>) + + typedef typename ei_traits<Block>::AlignedDerivedType AlignedDerivedType; + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) + + AlignedDerivedType allowAligned() + { + if (PacketAccess==Aligned) + return *this; + else + return Block<MatrixType,BlockRows,BlockCols,Aligned,HasDirectAccess> + (m_matrix, Base::m_data, Base::m_rows.value(), Base::m_cols.value()); + } /** Column or Row constructor */ inline Block(const MatrixType& matrix, int i) - : m_matrix(matrix), - m_data_ptr(&matrix.const_cast_derived().coeffRef( - (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0, - (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)), - m_blockRows(matrix.rows()), - m_blockCols(matrix.cols()) + : Base(&matrix.const_cast_derived().coeffRef( + (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0, + (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0), + BlockRows==1 ? 1 : matrix.rows(), + BlockCols==1 ? 
1 : matrix.cols()), + m_matrix(matrix) { ei_assert( (i>=0) && ( ((BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) && i<matrix.rows()) @@ -234,13 +250,10 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixTy /** Fixed-size constructor */ inline Block(const MatrixType& matrix, int startRow, int startCol) - : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)), - m_blockRows(matrix.rows()), m_blockCols(matrix.cols()) + : Base(&matrix.const_cast_derived().coeffRef(startRow,startCol)), m_matrix(matrix) { - EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && RowsAtCompileTime!=Dynamic,this_method_is_only_for_fixed_size); - ei_assert(RowsAtCompileTime!=Dynamic && RowsAtCompileTime!=Dynamic); ei_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= matrix.rows() - && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= matrix.cols()); + && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= matrix.cols()); } /** Dynamic-size constructor @@ -248,91 +261,25 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixTy inline Block(const MatrixType& matrix, int startRow, int startCol, int blockRows, int blockCols) - : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)), - m_blockRows(blockRows), m_blockCols(blockCols) + : Base(&matrix.const_cast_derived().coeffRef(startRow,startCol), blockRows, blockCols), + m_matrix(matrix) { ei_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows) - && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols)); + && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols)); ei_assert(startRow >= 0 && blockRows >= 1 && startRow + blockRows <= matrix.rows() - && startCol >= 0 && blockCols >= 1 && startCol + blockCols <= matrix.cols()); + && startCol >= 0 && blockCols >= 1 && startCol + blockCols <= matrix.cols()); } - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) - - 
inline int rows() const { return m_blockRows.value(); } - inline int cols() const { return m_blockCols.value(); } - inline int stride(void) const { return m_matrix.stride(); } - inline Scalar& coeffRef(int row, int col) - { - if (IsRowMajor) - return m_data_ptr[col + row * stride()]; - else - return m_data_ptr[row + col * stride()]; - } - - inline const Scalar coeff(int row, int col) const - { - if (IsRowMajor) - return m_data_ptr[col + row * stride()]; - else - return m_data_ptr[row + col * stride()]; - } - - inline Scalar& coeffRef(int index) - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); - return m_data_ptr[index]; - } - - inline const Scalar coeff(int index) const - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); - if ( (RowsAtCompileTime == 1) == IsRowMajor ) - return m_data_ptr[index]; - else - return m_data_ptr[index*stride()]; - } - - template<int LoadMode> - inline PacketScalar packet(int row, int col) const - { - if (IsRowMajor) - return ei_ploadu(&m_data_ptr[col + row * stride()]); - else - return ei_ploadu(&m_data_ptr[row + col * stride()]); - } - - template<int LoadMode> - inline void writePacket(int row, int col, const PacketScalar& x) - { - if (IsRowMajor) - ei_pstoreu(&m_data_ptr[col + row * stride()], x); - else - ei_pstoreu(&m_data_ptr[row + col * stride()], x); - } - - template<int LoadMode> - inline PacketScalar packet(int index) const - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); - return ei_ploadu(&m_data_ptr[index]); - } - - template<int LoadMode> - inline void writePacket(int index, const PacketScalar& x) - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); - ei_pstoreu(&m_data_ptr[index], x); - } - protected: + /** \internal used by allowAligned() */ + inline Block(const MatrixType& matrix, const Scalar* data, int blockRows, int blockCols) + : Base(data, blockRows, blockCols), m_matrix(matrix) + {} + const typename MatrixType::Nested m_matrix; - Scalar* m_data_ptr; - const ei_int_if_dynamic<RowsAtCompileTime> m_blockRows; - const 
ei_int_if_dynamic<ColsAtCompileTime> m_blockCols; }; /** \returns a dynamic-size expression of a block in *this. diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index 9d4b0af36..bd9f4d0d9 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -419,16 +419,19 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( ei_internal_assert((alignmentPattern==NoneAligned) || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0); } + + int offset1 = (FirstAligned && alignmentStep==1?3:1); + int offset3 = (FirstAligned && alignmentStep==1?1:3); int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (int i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+1]), - ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+3]); + Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+offset1]), + ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+offset3]); // this helps a lot generating better binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride; + const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (PacketSize>1) { @@ -453,17 +456,11 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( if(peels>1) { Packet A00, A01, A02, A03, A10, A11, A12, A13; - if (alignmentStep==1) - { - A00 = ptmp1; ptmp1 = ptmp3; ptmp3 = A00; - const Scalar* aux = lhs1; - lhs1 = lhs3; lhs3 = aux; - } A01 = ei_pload(&lhs1[alignedStart-1]); A02 = ei_pload(&lhs2[alignedStart-2]); A03 = ei_pload(&lhs3[alignedStart-3]); - + for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize) { A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); @@ -613,6 +610,9 @@ EIGEN_DONT_INLINE 
static void ei_cache_friendly_product_rowmajor_times_vector( ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1 || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0); } + + int offset1 = (FirstAligned && alignmentStep==1?3:1); + int offset3 = (FirstAligned && alignmentStep==1?1:3); int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (int i=skipRows; i<rowBound; i+=rowsAtOnce) @@ -620,8 +620,8 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0); // this helps the compiler generating good binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride; + const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (PacketSize>1) { @@ -658,13 +658,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( * than basic unaligned loads. 
*/ Packet A01, A02, A03, b, A11, A12, A13; - if (alignmentStep==1) - { - // flip row #1 and #3 - b = ptmp1; ptmp1 = ptmp3; ptmp3 = b; - const Scalar* aux = lhs1; - lhs1 = lhs3; lhs3 = aux; - } A01 = ei_pload(&lhs1[alignedStart-1]); A02 = ei_pload(&lhs2[alignedStart-2]); A03 = ei_pload(&lhs3[alignedStart-3]); @@ -690,13 +683,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( ptmp2 = ei_pmadd(b, A12, ptmp2); ptmp3 = ei_pmadd(b, A13, ptmp3); } - if (alignmentStep==1) - { - // restore rows #1 and #3 - b = ptmp1; ptmp1 = ptmp3; ptmp3 = b; - const Scalar* aux = lhs1; - lhs1 = lhs3; lhs3 = aux; - } } for (int j = peeledSize; j<alignedSize; j+=PacketSize) _EIGEN_ACCUMULATE_PACKETS(,u,u,); @@ -720,7 +706,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector( Scalar b = rhs[j]; tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j]; } - res[i] += tmp0; res[i+1] += tmp1; res[i+2] += tmp2; res[i+3] += tmp3; + res[i] += tmp0; res[i+offset1] += tmp1; res[i+2] += tmp2; res[i+offset3] += tmp3; } // process remaining first and last rows (at most columnsAtOnce-1) diff --git a/Eigen/src/Core/Coeffs.h b/Eigen/src/Core/Coeffs.h index e21083664..8836998be 100644 --- a/Eigen/src/Core/Coeffs.h +++ b/Eigen/src/Core/Coeffs.h @@ -298,22 +298,22 @@ inline void MatrixBase<Derived>::copyCoeff(int index, const MatrixBase<OtherDeri } template<typename Derived> -template<typename OtherDerived, int LoadStoreMode> +template<typename OtherDerived, int StoreMode, int LoadMode> inline void MatrixBase<Derived>::copyPacket(int row, int col, const MatrixBase<OtherDerived>& other) { ei_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols()); - derived().template writePacket<LoadStoreMode>(row, col, - other.derived().template packet<LoadStoreMode>(row, col)); + derived().template writePacket<StoreMode>(row, col, + other.derived().template packet<LoadMode>(row, col)); } template<typename Derived> 
-template<typename OtherDerived, int LoadStoreMode> +template<typename OtherDerived, int StoreMode, int LoadMode> inline void MatrixBase<Derived>::copyPacket(int index, const MatrixBase<OtherDerived>& other) { ei_internal_assert(index >= 0 && index < size()); - derived().template writePacket<LoadStoreMode>(index, - other.derived().template packet<LoadStoreMode>(index)); + derived().template writePacket<StoreMode>(index, + other.derived().template packet<LoadMode>(index)); } #endif // EIGEN_COEFFS_H diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index 51c1f9e43..dcf2c9063 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -67,7 +67,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > MaxColsAtCompileTime = Lhs::MaxColsAtCompileTime, Flags = (int(LhsFlags) | int(RhsFlags)) & ( HereditaryBits - | (int(LhsFlags) & int(RhsFlags) & LinearAccessBit) + | (int(LhsFlags) & int(RhsFlags) & (LinearAccessBit | AlignedBit)) | (ei_functor_traits<BinaryOp>::PacketAccess && ((int(LhsFlags) & RowMajorBit)==(int(RhsFlags) & RowMajorBit)) ? (int(LhsFlags) & int(RhsFlags) & PacketAccessBit) : 0)), CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + ei_functor_traits<BinaryOp>::Cost diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index bb354958e..e9aeb6608 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -55,7 +55,7 @@ struct ei_traits<CwiseUnaryOp<UnaryOp, MatrixType> > MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, Flags = (MatrixTypeFlags & ( - HereditaryBits | LinearAccessBit + HereditaryBits | LinearAccessBit | AlignedBit | (ei_functor_traits<UnaryOp>::PacketAccess ? 
PacketAccessBit : 0))), CoeffReadCost = MatrixTypeCoeffReadCost + ei_functor_traits<UnaryOp>::Cost }; diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index a5d2f0ba3..9bdff50b3 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -123,7 +123,9 @@ struct ei_dot_vec_unroller<Derived1, Derived2, Index, Stop, true> row1 = Derived1::RowsAtCompileTime == 1 ? 0 : Index, col1 = Derived1::RowsAtCompileTime == 1 ? Index : 0, row2 = Derived2::RowsAtCompileTime == 1 ? 0 : Index, - col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0 + col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0, + alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned, + alignment2 = (Derived2::Flags & AlignedBit) ? Aligned : Unaligned }; typedef typename Derived1::Scalar Scalar; @@ -131,7 +133,7 @@ struct ei_dot_vec_unroller<Derived1, Derived2, Index, Stop, true> inline static PacketScalar run(const Derived1& v1, const Derived2& v2) { - return ei_pmul(v1.template packet<Aligned>(row1, col1), v2.template packet<Aligned>(row2, col2)); + return ei_pmul(v1.template packet<alignment1>(row1, col1), v2.template packet<alignment2>(row2, col2)); } }; @@ -175,20 +177,22 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling> const int size = v1.size(); const int packetSize = ei_packet_traits<Scalar>::size; const int alignedSize = (size/packetSize)*packetSize; + const int alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned; + const int alignment2 = (Derived2::Flags & AlignedBit) ? 
Aligned : Unaligned; Scalar res; // do the vectorizable part of the sum if(size >= packetSize) { PacketScalar packet_res = ei_pmul( - v1.template packet<Aligned>(0), - v2.template packet<Aligned>(0) + v1.template packet<alignment1>(0), + v2.template packet<alignment2>(0) ); for(int index = packetSize; index<alignedSize; index += packetSize) { packet_res = ei_pmadd( - v1.template packet<Aligned>(index), - v2.template packet<Aligned>(index), + v1.template packet<alignment1>(index), + v2.template packet<alignment2>(index), packet_res ); } diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index f8d924f7d..a1953993f 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -2,6 +2,7 @@ // for linear algebra. Eigen itself is part of the KDE project. // // Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr> +// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr> // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -29,8 +30,8 @@ * * \brief A matrix or vector expression mapping an existing array of data. * - * \param Alignment can be either Aligned or Unaligned. Tells whether the array is suitably aligned for - * vectorization on the present CPU architecture. Defaults to Unaligned. + * \param _PacketAccess controls whether vectorized aligned loads or stores are allowed (Aligned) + * or forced to unaligned (Unaligned). Defaults to Unaligned. * * This class represents a matrix or vector expression mapping an existing array of data. 
* It can be used to let Eigen interface without any overhead with non-Eigen data structures, @@ -40,117 +41,43 @@ * * \sa Matrix::map() */ -template<typename MatrixType, int Alignment> -struct ei_traits<Map<MatrixType, Alignment> > +template<typename MatrixType, int _PacketAccess> +struct ei_traits<Map<MatrixType, _PacketAccess> > : public ei_traits<MatrixType> { - typedef typename MatrixType::Scalar Scalar; enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, - MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, - MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Flags = MatrixType::Flags, - CoeffReadCost = NumTraits<Scalar>::ReadCost + PacketAccess = _PacketAccess, + Flags = ei_traits<MatrixType>::Flags & ~AlignedBit }; + typedef typename ei_meta_if<int(PacketAccess)==Aligned, + Map<MatrixType, _PacketAccess>&, + Map<MatrixType, Aligned> >::ret AlignedDerivedType; }; -template<typename MatrixType, int Alignment> class Map - : public MatrixBase<Map<MatrixType, Alignment> > +template<typename MatrixType, int PacketAccess> class Map + : public MapBase<Map<MatrixType, PacketAccess> > { public: - EIGEN_GENERIC_PUBLIC_INTERFACE(Map) - - inline int rows() const { return m_rows.value(); } - inline int cols() const { return m_cols.value(); } + _EIGEN_GENERIC_PUBLIC_INTERFACE(Map, MapBase<Map>) + typedef typename ei_traits<Map>::AlignedDerivedType AlignedDerivedType; inline int stride() const { return this->innerSize(); } - inline const Scalar& coeff(int row, int col) const + AlignedDerivedType allowAligned() { - if(Flags & RowMajorBit) - return m_data[col + row * m_cols.value()]; - else // column-major - return m_data[row + col * m_rows.value()]; + if (PacketAccess==Aligned) + return *this; + else + return Map<MatrixType,Aligned>(Base::m_data, Base::m_rows.value(), Base::m_cols.value()); } - inline Scalar& coeffRef(int row, int col) - { - if(Flags & RowMajorBit) - return 
const_cast<Scalar*>(m_data)[col + row * m_cols.value()]; - else // column-major - return const_cast<Scalar*>(m_data)[row + col * m_rows.value()]; - } + inline Map(const Scalar* data) : Base(data) {} - inline const Scalar& coeff(int index) const - { - return m_data[index]; - } + inline Map(const Scalar* data, int size) : Base(data, size) {} - inline Scalar& coeffRef(int index) - { - return *const_cast<Scalar*>(m_data + index); - } - - template<int LoadMode> - inline PacketScalar packet(int row, int col) const - { - return ei_ploadt<Scalar, LoadMode == Aligned ? Alignment : Unaligned> - (m_data + (Flags & RowMajorBit - ? col + row * m_cols.value() - : row + col * m_rows.value())); - } - - template<int LoadMode> - inline PacketScalar packet(int index) const - { - return ei_ploadt<Scalar, LoadMode == Aligned ? Alignment : Unaligned>(m_data + index); - } - - template<int StoreMode> - inline void writePacket(int row, int col, const PacketScalar& x) - { - ei_pstoret<Scalar, PacketScalar, StoreMode == Aligned ? Alignment : Unaligned> - (const_cast<Scalar*>(m_data) + (Flags & RowMajorBit - ? col + row * m_cols.value() - : row + col * m_rows.value()), x); - } - - template<int StoreMode> - inline void writePacket(int index, const PacketScalar& x) - { - ei_pstoret<Scalar, PacketScalar, StoreMode == Aligned ? Alignment : Unaligned> - (const_cast<Scalar*>(m_data) + index, x); - } - - inline Map(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime) - { - EIGEN_STATIC_ASSERT_FIXED_SIZE(MatrixType) - } - - inline Map(const Scalar* data, int size) - : m_data(data), - m_rows(RowsAtCompileTime == Dynamic ? size : RowsAtCompileTime), - m_cols(ColsAtCompileTime == Dynamic ? 
size : ColsAtCompileTime) - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(MatrixType) - ei_assert(size > 0); - ei_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == size); - } - - inline Map(const Scalar* data, int rows, int cols) - : m_data(data), m_rows(rows), m_cols(cols) - { - ei_assert(rows > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) - && cols > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)); - } + inline Map(const Scalar* data, int rows, int cols) : Base(data, rows, cols) {} EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map) - - protected: - const Scalar* m_data; - const ei_int_if_dynamic<RowsAtCompileTime> m_rows; - const ei_int_if_dynamic<ColsAtCompileTime> m_cols; }; /** Constructor copying an existing array of data. diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h new file mode 100644 index 000000000..0b54ca7b8 --- /dev/null +++ b/Eigen/src/Core/MapBase.h @@ -0,0 +1,167 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. Eigen itself is part of the KDE project. +// +// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr> +// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_MAPBASE_H +#define EIGEN_MAPBASE_H + +/** \internal + * + * \class MapBase + * + * \brief Base class for Map and Block expression with direct access + * + * \sa class Map, class Block + */ +template<typename Derived> class MapBase + : public MatrixBase<Derived> +{ + public: + + typedef MatrixBase<Derived> Base; + enum { + IsRowMajor = int(ei_traits<Derived>::Flags) & RowMajorBit ? 1 : 0, + PacketAccess = ei_traits<Derived>::PacketAccess, + RowsAtCompileTime = ei_traits<Derived>::RowsAtCompileTime, + ColsAtCompileTime = ei_traits<Derived>::ColsAtCompileTime, + SizeAtCompileTime = Base::SizeAtCompileTime + }; + + typedef typename ei_traits<Derived>::AlignedDerivedType AlignedDerivedType; + typedef typename ei_traits<Derived>::Scalar Scalar; + typedef typename Base::PacketScalar PacketScalar; + using Base::derived; + + inline int rows() const { return m_rows.value(); } + inline int cols() const { return m_cols.value(); } + + inline int stride() const { return derived().stride(); } + AlignedDerivedType allowAligned() { return derived().allowAligned(); } + + inline const Scalar& coeff(int row, int col) const + { + if(IsRowMajor) + return m_data[col + row * stride()]; + else // column-major + return m_data[row + col * stride()]; + } + + inline Scalar& coeffRef(int row, int col) + { + if(IsRowMajor) + return const_cast<Scalar*>(m_data)[col + row * stride()]; + else // column-major + return const_cast<Scalar*>(m_data)[row + col * stride()]; + } + + inline const Scalar coeff(int index) const + { + ei_assert(Derived::IsVectorAtCompileTime || (ei_traits<Derived>::Flags & LinearAccessBit)); + if ( ((RowsAtCompileTime == 1) == IsRowMajor) ) + return m_data[index]; + else + return m_data[index*stride()]; + } + + inline Scalar& coeffRef(int index) + { + return 
*const_cast<Scalar*>(m_data + index); + } + + template<int LoadMode> + inline PacketScalar packet(int row, int col) const + { + return ei_ploadt<Scalar, int(PacketAccess) == Aligned ? Aligned : LoadMode> + (m_data + (IsRowMajor ? col + row * stride() + : row + col * stride())); + } + + template<int LoadMode> + inline PacketScalar packet(int index) const + { + return ei_ploadt<Scalar, int(PacketAccess) == Aligned ? Aligned : LoadMode>(m_data + index); + } + + template<int StoreMode> + inline void writePacket(int row, int col, const PacketScalar& x) + { + ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == Aligned ? Aligned : StoreMode> + (const_cast<Scalar*>(m_data) + (IsRowMajor ? col + row * stride() + : row + col * stride()), x); + } + + template<int StoreMode> + inline void writePacket(int index, const PacketScalar& x) + { + ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == Aligned ? Aligned : StoreMode> + (const_cast<Scalar*>(m_data) + index, x); + } + + inline MapBase(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime) + { + EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) + } + + inline MapBase(const Scalar* data, int size) + : m_data(data), + m_rows(RowsAtCompileTime == Dynamic ? size : RowsAtCompileTime), + m_cols(ColsAtCompileTime == Dynamic ? 
size : ColsAtCompileTime) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + ei_assert(size > 0); + ei_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == size); + } + + inline MapBase(const Scalar* data, int rows, int cols) + : m_data(data), m_rows(rows), m_cols(cols) + { + ei_assert(rows > 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) + && cols > 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)); + } + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MapBase) + +// EIGEN_INHERIT_ASSIGNMENT_OPERATOR(MapBase, =) + + template<typename OtherDerived> + Derived& operator+=(const MatrixBase<OtherDerived>& other) + { return derived() = allowAligned() + other; } + + template<typename OtherDerived> + Derived& operator-=(const MatrixBase<OtherDerived>& other) + { return derived() = allowAligned() - other; } + + Derived& operator*=(const Scalar& other) + { return derived() = allowAligned() * other; } + + Derived& operator/=(const Scalar& other) + { return derived() = allowAligned() / other; } + + protected: + const Scalar* __restrict__ m_data; + const ei_int_if_dynamic<RowsAtCompileTime> m_rows; + const ei_int_if_dynamic<ColsAtCompileTime> m_cols; +}; + +#endif // EIGEN_MAPBASE_H diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 0243faaed..25a4c8b08 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -257,9 +257,9 @@ template<typename Derived> class MatrixBase void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other); template<typename OtherDerived> void copyCoeff(int index, const MatrixBase<OtherDerived>& other); - template<typename OtherDerived, int LoadStoreMode> + template<typename OtherDerived, int StoreMode, int LoadMode> void copyPacket(int row, int col, const MatrixBase<OtherDerived>& other); - template<typename OtherDerived, int LoadStoreMode> + template<typename OtherDerived, int StoreMode, int LoadMode> void copyPacket(int index, const MatrixBase<OtherDerived>& other); 
template<int LoadMode> diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h index fa75429c8..6c7280800 100644 --- a/Eigen/src/Core/Sum.h +++ b/Eigen/src/Core/Sum.h @@ -33,17 +33,22 @@ template<typename Derived> struct ei_sum_traits { +private: + enum { + PacketSize = ei_packet_traits<typename Derived::Scalar>::size + }; + public: enum { Vectorization = (int(Derived::Flags)&ActualPacketAccessBit) && (int(Derived::Flags)&LinearAccessBit) + && (int(Derived::SizeAtCompileTime)>2*PacketSize) ? LinearVectorization : NoVectorization }; private: enum { - PacketSize = ei_packet_traits<typename Derived::Scalar>::size, Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost, UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize)) @@ -131,7 +136,8 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true> : Index % Derived::RowsAtCompileTime, col = int(Derived::Flags)&RowMajorBit ? Index % int(Derived::ColsAtCompileTime) - : Index / Derived::RowsAtCompileTime + : Index / Derived::RowsAtCompileTime, + alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned }; typedef typename Derived::Scalar Scalar; @@ -139,7 +145,7 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true> inline static PacketScalar run(const Derived &mat) { - return mat.template packet<Aligned>(row, col); + return mat.template packet<alignment>(row, col); } }; @@ -185,14 +191,21 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling> { const int size = mat.size(); const int packetSize = ei_packet_traits<Scalar>::size; - const int alignedSize = (size/packetSize)*packetSize; + const int alignedStart = (Derived::Flags & AlignedBit) + || !(Derived::Flags & DirectAccessBit) + ? 0 + : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size); + const int alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit) + ? 
Aligned : Unaligned; + const int alignedSize = ((size-alignedStart)/packetSize)*packetSize; + const int alignedEnd = alignedStart + alignedSize; Scalar res; - if(size >= packetSize) + if(Derived::SizeAtCompileTime>=2*packetSize && alignedSize >= 2*packetSize) { - PacketScalar packet_res = mat.template packet<Aligned>(0, 0); - for(int index = packetSize; index < alignedSize; index += packetSize) - packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index)); + PacketScalar packet_res = mat.template packet<alignment>(alignedStart, alignedStart); + for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize) + packet_res = ei_padd(packet_res, mat.template packet<alignment>(index)); res = ei_predux(packet_res); } @@ -202,10 +215,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling> res = Scalar(0); } - for(int index = alignedSize; index < size; index++) - { + for(int index = 0; index < alignedStart; index++) + res += mat.coeff(index); + + for(int index = alignedEnd; index < size; index++) res += mat.coeff(index); - } return res; } diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index b58fd1279..31e8170f5 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h @@ -59,6 +59,16 @@ template<typename ExpressionType> class SwapWrapper inline int cols() const { return m_expression.cols(); } inline int stride() const { return m_expression.stride(); } + inline Scalar& coeffRef(int row, int col) + { + return m_expression.const_cast_derived().coeffRef(row, col); + } + + inline Scalar& coeffRef(int index) + { + return m_expression.const_cast_derived().coeffRef(index); + } + template<typename OtherDerived> void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other) { @@ -80,29 +90,29 @@ template<typename ExpressionType> class SwapWrapper _other.coeffRef(index) = tmp; } - template<typename OtherDerived, int LoadStoreMode> + template<typename OtherDerived, int StoreMode, int LoadMode> void copyPacket(int 
row, int col, const MatrixBase<OtherDerived>& other) { OtherDerived& _other = other.const_cast_derived(); ei_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols()); - Packet tmp = m_expression.template packet<LoadStoreMode>(row, col); - m_expression.template writePacket<LoadStoreMode>(row, col, - _other.template packet<LoadStoreMode>(row, col) + Packet tmp = m_expression.template packet<StoreMode>(row, col); + m_expression.template writePacket<StoreMode>(row, col, + _other.template packet<LoadMode>(row, col) ); - _other.template writePacket<LoadStoreMode>(row, col, tmp); + _other.template writePacket<LoadMode>(row, col, tmp); } - template<typename OtherDerived, int LoadStoreMode> + template<typename OtherDerived, int StoreMode, int LoadMode> void copyPacket(int index, const MatrixBase<OtherDerived>& other) { OtherDerived& _other = other.const_cast_derived(); ei_internal_assert(index >= 0 && index < m_expression.size()); - Packet tmp = m_expression.template packet<LoadStoreMode>(index); - m_expression.template writePacket<LoadStoreMode>(index, - _other.template packet<LoadStoreMode>(index) + Packet tmp = m_expression.template packet<StoreMode>(index); + m_expression.template writePacket<StoreMode>(index, + _other.template packet<LoadMode>(index) ); - _other.template writePacket<LoadStoreMode>(index, tmp); + _other.template writePacket<LoadMode>(index, tmp); } protected: diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 24c653e2e..ea3994544 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -121,40 +121,45 @@ const unsigned int DirectAccessBit = 0x20; /** \ingroup flags * + * means the first coefficient packet is guaranteed to be aligned */ +const unsigned int AlignedBit = 0x40; + +/** \ingroup flags + * * means all diagonal coefficients are equal to 0 */ -const unsigned int ZeroDiagBit = 0x40; +const unsigned int ZeroDiagBit = 0x80; /** \ingroup flags * * means all diagonal 
coefficients are equal to 1 */ -const unsigned int UnitDiagBit = 0x80; +const unsigned int UnitDiagBit = 0x100; /** \ingroup flags * * means the matrix is selfadjoint (M=M*). */ -const unsigned int SelfAdjointBit = 0x100; +const unsigned int SelfAdjointBit = 0x200; /** \ingroup flags * * means the strictly lower triangular part is 0 */ -const unsigned int UpperTriangularBit = 0x200; +const unsigned int UpperTriangularBit = 0x400; /** \ingroup flags * * means the strictly upper triangular part is 0 */ -const unsigned int LowerTriangularBit = 0x400; +const unsigned int LowerTriangularBit = 0x800; /** \ingroup flags * * means the expression includes sparse matrices and the sparse path has to be taken. */ -const unsigned int SparseBit = 0x800; +const unsigned int SparseBit = 0x1000; /** \ingroup flags * * currently unused. Means the matrix probably has a very big size. * Could eventually be used as a hint to determine which algorithms * to use. */ -const unsigned int LargeBit = 0x1000; +const unsigned int LargeBit = 0x2000; // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit @@ -175,15 +180,21 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit; const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit; const unsigned int Diagonal = Upper | Lower; -enum { Aligned=0, Unaligned=1 }; +enum { Aligned=0, Unaligned=1, Unknown=2 }; enum { ConditionalJumpCost = 5 }; enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight }; enum DirectionType { Vertical, Horizontal }; enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, DiagonalProduct, SparseProduct }; enum { + /** \internal Equivalent to a slice vectorization for fixed-size matrices having good alignment + * and good size */ InnerVectorization, + /** \internal Vectorization path using a single loop plus scalar loops for the + * unaligned boundaries */ LinearVectorization, + /** \internal Generic vectorization path using one vectorized loop per 
row/column with some + * scalar loops to handle the unaligned boundaries */ SliceVectorization, NoVectorization }; diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index a886a90d0..7a1f95443 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -43,8 +43,8 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas template<typename ExpressionType> class NestByValue; template<typename ExpressionType> class SwapWrapper; template<typename MatrixType> class Minor; -template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic, - int DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit> class Block; +template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic, int PacketAccess=Unaligned, + int _DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit> class Block; template<typename MatrixType> class Transpose; template<typename MatrixType> class Conjugate; template<typename NullaryOp, typename MatrixType> class CwiseNullaryOp; @@ -53,7 +53,7 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp; template<typename Lhs, typename Rhs, int ProductMode> class Product; template<typename CoeffsVectorType> class DiagonalMatrix; template<typename MatrixType> class DiagonalCoeffs; -template<typename MatrixType, int Alignment = Unaligned> class Map; +template<typename MatrixType, int PacketAccess = Unaligned> class Map; template<typename MatrixType, unsigned int Mode> class Part; template<typename MatrixType, unsigned int Mode> class Extract; template<typename ExpressionType> class Cwise; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 2fb401c6c..9d844d222 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -168,12 +168,14 @@ class ei_corrected_matrix_flags packet_access_bit = ei_packet_traits<Scalar>::size > 1 && (is_big || 
linear_size%ei_packet_traits<Scalar>::size==0) - ? PacketAccessBit : 0 + ? PacketAccessBit : 0, + aligned_bit = packet_access_bit + && (is_big || linear_size%ei_packet_traits<Scalar>::size==0) ? AlignedBit : 0 }; public: enum { ret = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit | PacketAccessBit | RowMajorBit)) - | LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit + | LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit | aligned_bit }; }; diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index e17e9ff4a..b2065fdcc 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -105,8 +105,11 @@ public: /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from * its four coefficients \a w, \a x, \a y and \a z. + * + * \warning Note the order of the arguments: the real \a w coefficient first, + * while internally the coefficients are stored in the following order: + * [\c x, \c y, \c z, \c w] */ - // FIXME what is the prefered order: w x,y,z or x,y,z,w ? inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z) { m_coeffs << x, y, z, w; } @@ -313,8 +316,8 @@ inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBas } /** \returns the multiplicative inverse of \c *this - * Note that in most cases, i.e., if you simply want the opposite - * rotation, it is enough to use the conjugate. + * Note that in most cases, i.e., if you simply want the opposite rotation, + * and/or the quaternion is normalized, then it is enough to use the conjugate. * * \sa Quaternion::conjugate() */ |