diff options
author | Gael Guennebaud <g.gael@free.fr> | 2008-07-12 22:59:34 +0000 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2008-07-12 22:59:34 +0000 |
commit | 861d18d5532546ddb0cd2bff8795eda080ce0c85 (patch) | |
tree | a5f874ce1846c3c36a067cb88fe0ae3775adc56b | |
parent | 1bbaea988501ac4c151485bbbe1a170e594ce90b (diff) |
* Optimization: added a specialization of Block for xpr with DirectAccessBit
* some simplifications and fixes in cache friendly products
-rw-r--r-- | Eigen/src/Core/Block.h | 139 | ||||
-rw-r--r-- | Eigen/src/Core/CacheFriendlyProduct.h | 99 | ||||
-rw-r--r-- | Eigen/src/Core/Product.h | 24 | ||||
-rw-r--r-- | Eigen/src/Core/util/Constants.h | 5 | ||||
-rw-r--r-- | Eigen/src/Core/util/ForwardDeclarations.h | 3 | ||||
-rw-r--r-- | bench/benchBlasGemm.cpp | 8 |
6 files changed, 198 insertions, 80 deletions
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index ae4e83c9e..e7eb87a26 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -56,8 +56,8 @@ * * \sa MatrixBase::block(int,int,int,int), MatrixBase::block(int,int), class VectorBlock */ -template<typename MatrixType, int BlockRows, int BlockCols> -struct ei_traits<Block<MatrixType, BlockRows, BlockCols> > +template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus> +struct ei_traits<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> > { typedef typename MatrixType::Scalar Scalar; enum{ @@ -83,8 +83,8 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> > }; }; -template<typename MatrixType, int BlockRows, int BlockCols> class Block - : public MatrixBase<Block<MatrixType, BlockRows, BlockCols> > +template<typename MatrixType, int BlockRows, int BlockCols, int DirectAccesStatus> class Block + : public MatrixBase<Block<MatrixType, BlockRows, BlockCols, DirectAccesStatus> > { public: @@ -203,6 +203,137 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block const ei_int_if_dynamic<ColsAtCompileTime> m_blockCols; }; +/** \internal */ +template<typename MatrixType, int BlockRows, int BlockCols> class Block<MatrixType,BlockRows,BlockCols,HasDirectAccess> + : public MatrixBase<Block<MatrixType, BlockRows, BlockCols,HasDirectAccess> > +{ + enum { + IsRowMajor = int(ei_traits<MatrixType>::Flags)&RowMajorBit ? 1 : 0 + }; + + public: + + EIGEN_GENERIC_PUBLIC_INTERFACE(Block) + + /** Column or Row constructor + */ + inline Block(const MatrixType& matrix, int i) + : m_matrix(matrix), + m_data_ptr(&matrix.const_cast_derived().coeffRef( + (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0, + (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)), + m_blockRows(matrix.rows()), + m_blockCols(matrix.cols()) + { + ei_assert( (i>=0) && ( + ((BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) && i<matrix.rows()) + ||((BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) && i<matrix.cols()))); + } + + /** Fixed-size constructor + */ + inline Block(const MatrixType& matrix, int startRow, int startCol) + : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)) + { + ei_assert(RowsAtCompileTime!=Dynamic && RowsAtCompileTime!=Dynamic); + ei_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= matrix.rows() + && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= matrix.cols()); + } + + /** Dynamic-size constructor + */ + inline Block(const MatrixType& matrix, + int startRow, int startCol, + int blockRows, int blockCols) + : m_matrix(matrix), m_data_ptr(&matrix.const_cast_derived().coeffRef(startRow,startCol)), + m_blockRows(blockRows), m_blockCols(blockCols) + { + ei_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows) + && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols)); + ei_assert(startRow >= 0 && blockRows >= 1 && startRow + blockRows <= matrix.rows() + && startCol >= 0 && blockCols >= 1 && startCol + blockCols <= matrix.cols()); + } + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) + + inline int rows() const { return m_blockRows.value(); } + inline int cols() const { return m_blockCols.value(); } + + inline int stride(void) const { return m_matrix.stride(); } + + inline Scalar& coeffRef(int row, int col) + { + if (IsRowMajor) + return m_data_ptr[col + row * stride()]; + else + return m_data_ptr[row + col * stride()]; + } + + inline const Scalar coeff(int row, int col) const + { +// std::cerr << "coeff(int row, int col)\n"; + if (IsRowMajor) + return m_data_ptr[col + row * stride()]; + else + return m_data_ptr[row + col * stride()]; + } + + inline Scalar& coeffRef(int index) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); + return m_data_ptr[index]; + } + + inline const Scalar coeff(int index) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); + if ( (RowsAtCompileTime == 1) == IsRowMajor ) + return m_data_ptr[index]; + else + return m_data_ptr[index*stride()]; + } + + template<int LoadMode> + inline PacketScalar packet(int row, int col) const + { + if (IsRowMajor) + return ei_ploadu(&m_data_ptr[col + row * stride()]); + else + return ei_ploadu(&m_data_ptr[row + col * stride()]); + } + + template<int LoadMode> + inline void writePacket(int row, int col, const PacketScalar& x) + { + if (IsRowMajor) + ei_pstoreu(&m_data_ptr[col + row * stride()], x); + else + ei_pstoreu(&m_data_ptr[row + col * stride()], x); + } + + template<int LoadMode> + inline PacketScalar packet(int index) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); + return ei_ploadu(&m_data_ptr[index]); + } + + template<int LoadMode> + inline void writePacket(int index, const PacketScalar& x) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Block); + ei_pstoreu(&m_data_ptr[index], x); + } + + protected: + + const typename MatrixType::Nested m_matrix; + Scalar* m_data_ptr; + const ei_int_if_dynamic<RowsAtCompileTime> m_blockRows; + const ei_int_if_dynamic<ColsAtCompileTime> m_blockCols; +}; + + /** \returns a dynamic-size expression of a block in *this. * * \param startRow the first row in the block diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index a710d44d4..06b3f5876 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -367,7 +367,7 @@ static void ei_cache_friendly_product( * TODO: since rhs gets evaluated only once, no need to evaluate it */ template<typename Scalar, typename RhsType> -EIGEN_DONT_INLINE static void ei_cache_friendly_product( +EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, @@ -408,54 +408,34 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product( : alignmentStep==2 ? EvenAligned : FirstAligned; - // find how many column do we have to skip to be aligned with the result (if possible) + // find how many columns do we have to skip to be aligned with the result (if possible) int skipColumns=0; - for (; skipColumns<PacketSize; ++skipColumns) - { - if (alignedStart == alignmentStep*skipColumns) - break; - } + for (; skipColumns<PacketSize && alignedStart != alignmentStep*skipColumns; ++skipColumns) + {} if (skipColumns==PacketSize) + { + // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; - skipColumns = std::min(skipColumns,rhs.size()); - if (alignmentPattern!=NoneAligned) - for (int i=0; i<skipColumns; i++) - { - Scalar tmp0 = rhs[i]; - Packet ptmp0 = ei_pset1(tmp0); - int iN0 = i*lhsStride; - // process first unaligned result's coeffs - for (int j=0; j<alignedStart; j++) - res[j] += tmp0 * lhs[j+iN0]; - // process aligned result's coeffs (we know the lhs columns are not aligned) - for (int j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j]))); - // process remaining result's coeffs - for (int j=alignedSize; j<size; j++) - res[j] += tmp0 * lhs[j+iN0]; - } + skipColumns = 0; + } + else + { + skipColumns = std::min(skipColumns,rhs.size()); + // note that the skiped columns are processed later. + } int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce; - for (int i=0; i<columnBound; i+=columnsAtOnce) + for (int i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Scalar tmp0 = rhs[i]; - Packet ptmp0 = ei_pset1(tmp0); - Scalar tmp1 = rhs[i+1]; - Packet ptmp1 = ei_pset1(tmp1); - Scalar tmp2 = rhs[i+2]; - Packet ptmp2 = ei_pset1(tmp2); - Scalar tmp3 = rhs[i+3]; - Packet ptmp3 = ei_pset1(tmp3); - int iN0 = i*lhsStride; - int iN1 = (i+1)*lhsStride; - int iN2 = (i+2)*lhsStride; - int iN3 = (i+3)*lhsStride; + Scalar tmp0 = rhs[i], tmp1 = rhs[i+1], tmp2 = rhs[i+2], tmp3 = rhs[i+3]; + Packet ptmp0 = ei_pset1(tmp0), ptmp1 = ei_pset1(tmp1), ptmp2 = ei_pset1(tmp2), ptmp3 = ei_pset1(tmp3); + int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride; // process initial unaligned coeffs for (int j=0; j<alignedStart; j++) res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3]; - if (alignedSize>0) + if (alignedSize>alignedStart) { switch(alignmentPattern) { @@ -475,10 +455,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product( _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize); if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize); if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize); - if (peels>4) _EIGEN_ACCUMULATE_PACKETS(,u,u,+4*PacketSize); - if (peels>5) _EIGEN_ACCUMULATE_PACKETS(,u,u,+5*PacketSize); - if (peels>6) _EIGEN_ACCUMULATE_PACKETS(,u,u,+6*PacketSize); - if (peels>7) _EIGEN_ACCUMULATE_PACKETS(,u,u,+7*PacketSize); } for (int j = peeledSize; j<alignedSize; j+=PacketSize) _EIGEN_ACCUMULATE_PACKETS(,u,u,); @@ -494,25 +470,42 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product( for (int j=alignedSize; j<size; j++) res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3]; } - for (int i=columnBound; i<rhs.size(); i++) + + // process remaining first and last columns (at most columnsAtOnce-1) + int end = rhs.size(); + int start = columnBound; + do { - Scalar tmp0 = rhs[i]; - Packet ptmp0 = ei_pset1(tmp0); - int iN0 = i*lhsStride; - if (alignedSize>0) + for (int i=columnBound; i<end; i++) { - bool aligned0 = (iN0 % PacketSize) == 0; - if (aligned0) + Scalar tmp0 = rhs[i]; + Packet ptmp0 = ei_pset1(tmp0); + int iN0 = i*lhsStride; + // process first unaligned result's coeffs + for (int j=0; j<alignedStart; j++) + res[j] += tmp0 * lhs[j+iN0]; + + // process aligned result's coeffs + if ((iN0 % PacketSize) == 0) for (int j = 0;j<alignedSize;j+=PacketSize) ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j]))); else for (int j = 0;j<alignedSize;j+=PacketSize) ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j]))); + + // process remaining scalars + for (int j=alignedSize; j<size; j++) + res[j] += tmp0 * lhs[j+iN0]; } - // process remaining scalars - for (int j=alignedSize; j<size; j++) - res[j] += tmp0 * lhs[j+iN0]; - } + if (skipColumns) + { + start = 0; + end = skipColumns; + skipColumns = 0; + } + else + break; + } while(true); asm("#end matrix_vector_product"); #undef _EIGEN_ACCUMULATE_PACKETS diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index c2f0c07a8..31677e6cb 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -365,8 +365,7 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs> } }; -// FIXME the following is a hack to get very high perf with matrix-vector product, -// however, it would be preferable to switch for more general dynamic alignment queries +// NOTE the following specializations are because taking .col(0) on a vector is a bit slower template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime> struct ei_product_coeff_vectorized_dyn_selector { @@ -481,14 +480,9 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod ***************************************************************************/ template<typename Scalar, typename RhsType> -static void ei_cache_friendly_product( +static void ei_cache_friendly_product_colmajor_times_vector( int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res); -enum { - HasDirectAccess, - NoDirectAccess -}; - template<typename ProductType, int LhsRows = ei_traits<ProductType>::RowsAtCompileTime, int LhsOrder = int(ei_traits<ProductType>::LhsFlags)&RowMajorBit ? RowMajor : ColMajor, @@ -507,19 +501,13 @@ struct ei_cache_friendly_product_selector // optimized colmajor * vector path template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess> -struct ei_cache_friendly_product_selector<ProductType,LhsRows,NoDirectAccess,ColMajor,1,RhsOrder,RhsAccess> +struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,NoDirectAccess,1,RhsOrder,RhsAccess> { - typedef typename ei_traits<ProductType>::_LhsNested Lhs; template<typename DestDerived> inline static void run(DestDerived& res, const ProductType& product) { - ei_scalar_sum_op<typename ProductType::Scalar> _sum; const int size = product.rhs().rows(); for (int k=0; k<size; ++k) - if (Lhs::Flags&DirectAccessBit) - // TODO to properly hanlde this workaround let's specialize Block for type having the DirectAccessBit - res += product.rhs().coeff(k) * Map<DestDerived>(&product.lhs().const_cast_derived().coeffRef(0,k),product.lhs().rows()); - else res += product.rhs().coeff(k) * product.lhs().col(k); } }; @@ -527,7 +515,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,NoDirectAccess,Col // optimized cache friendly colmajor * vector path for matrix with direct access flag // NOTE this path coul also be enabled for expressions if we add runtime align queries template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess> -struct ei_cache_friendly_product_selector<ProductType,LhsRows,HasDirectAccess,ColMajor,1,RhsOrder,RhsAccess> +struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirectAccess,1,RhsOrder,RhsAccess> { typedef typename ProductType::Scalar Scalar; @@ -545,7 +533,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,HasDirectAccess,Co _res = (Scalar*)alloca(sizeof(Scalar)*res.size()); Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res; } - ei_cache_friendly_product(res.size(), + ei_cache_friendly_product_colmajor_times_vector(res.size(), &product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(), product.rhs(), _res); @@ -588,7 +576,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo _res = (Scalar*)alloca(sizeof(Scalar)*res.size()); Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res; } - ei_cache_friendly_product(res.size(), + ei_cache_friendly_product_colmajor_times_vector(res.size(), &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(), product.lhs().transpose(), _res); diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index c51aa282b..7e2d37dff 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -206,6 +206,11 @@ enum { RowMajor = RowMajorBit }; +enum { + NoDirectAccess = 0, + HasDirectAccess = DirectAccessBit +}; + const int FullyCoherentAccessPattern = 0x1; const int InnerCoherentAccessPattern = 0x2 | FullyCoherentAccessPattern; const int OuterCoherentAccessPattern = 0x4 | InnerCoherentAccessPattern; diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 922a3716f..f96d57747 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -42,7 +42,8 @@ class Matrix; template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged; template<typename ExpressionType> class NestByValue; template<typename MatrixType> class Minor; -template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic> class Block; +template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic, + int DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit> class Block; template<typename MatrixType> class Transpose; template<typename MatrixType> class Conjugate; template<typename NullaryOp, typename MatrixType> class CwiseNullaryOp; diff --git a/bench/benchBlasGemm.cpp b/bench/benchBlasGemm.cpp index 02f067e1a..5455b6ed5 100644 --- a/bench/benchBlasGemm.cpp +++ b/bench/benchBlasGemm.cpp @@ -82,12 +82,12 @@ int main(int argc, char *argv[]) std::cout << "Usage: " << argv[0] << " size nbloops nbtries\n"; std::cout << "Usage: " << argv[0] << " M N K nbloops nbtries\n"; std::cout << "Usage: " << argv[0] << " check\n"; - std::cout << "Options:\n" + std::cout << "Options:\n"; std::cout << " size unique size of the 2 matrices (integer)\n"; std::cout << " auto automatically set the number of repetitions and tries\n"; - std::cout << " nbloops number of times the GEMM routines is executed\n" - std::cout << " nbtries number of times the loop is benched (return the best try)\n" - std::cout << " M N K sizes of the matrices: MxN = MxK * KxN (integers)\n" + std::cout << " nbloops number of times the GEMM routines is executed\n"; + std::cout << " nbtries number of times the loop is benched (return the best try)\n"; + std::cout << " M N K sizes of the matrices: MxN = MxK * KxN (integers)\n"; std::cout << " check check eigen product using cblas as a reference\n"; exit(1); } |