// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H

/* Specialization for a row-major destination matrix => simple transposition of the product */
template<
  typename Scalar, typename Index,
  int LhsStorageOrder, bool ConjugateLhs,
  int RhsStorageOrder, bool ConjugateRhs>
struct ei_general_matrix_matrix_product<Scalar,Index,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,RowMajor>
{
  static EIGEN_STRONG_INLINE void run(
    Index rows, Index cols, Index depth,
    const Scalar* lhs, Index lhsStride,
    const Scalar* rhs, Index rhsStride,
    Scalar* res, Index resStride,
    Scalar alpha,
    GemmParallelInfo<Index>* info = 0)
  {
    // transpose the product such that the result is column major
    ei_general_matrix_matrix_product<Scalar, Index,
      RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
      ConjugateRhs,
      LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
      ConjugateLhs,
      ColMajor>
    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,info);
  }
};
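
// For intuition: the transposition above relies on the identity (A*B)^T = B^T * A^T,
// so filling a row-major C is the same as filling a column-major C^T from the swapped,
// transposed operands; each storage order is flipped and each conjugation flag follows
// its operand. A schematic user-level illustration of the same identity, in plain
// Eigen code and for exposition only:
//
//   #include <Eigen/Core>
//   using namespace Eigen;
//
//   void transposition_identity()
//   {
//     MatrixXd A = MatrixXd::Random(3,4), B = MatrixXd::Random(4,5);
//     Matrix<double,Dynamic,Dynamic,RowMajor> C = A * B; // row-major destination
//     MatrixXd Ct = B.transpose() * A.transpose();       // column-major C^T
//     // C(i,j) and Ct(j,i) agree for all i,j.
//   }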
/* Specialization for a col-major destination matrix
 *   => Blocking algorithm following Goto's paper */
template<
  typename Scalar, typename Index,
  int LhsStorageOrder, bool ConjugateLhs,
  int RhsStorageOrder, bool ConjugateRhs>
struct ei_general_matrix_matrix_product<Scalar,Index,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor>
{
static void run(Index rows, Index cols, Index depth,
  const Scalar* _lhs, Index lhsStride,
  const Scalar* _rhs, Index rhsStride,
  Scalar* res, Index resStride,
  Scalar alpha,
  GemmParallelInfo<Index>* info = 0)
{
  ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
  ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);

  if (ConjugateRhs)
    alpha = ei_conj(alpha);

  typedef typename ei_packet_traits<Scalar>::type PacketType;
  typedef ei_product_blocking_traits<Scalar> Blocking;

  Index kc; // cache block size along the K direction
  Index mc; // cache block size along the M direction
  Index nc; // cache block size along the N direction
  getBlockingSizes<Scalar>(kc, mc, nc);
  kc = std::min(kc,depth);
  mc = std::min(mc,rows);
  nc = std::min(nc,cols);

  ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
  ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
  ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp;

#ifdef EIGEN_HAS_OPENMP
  if(info)
  {
    // this is the parallel version!
    Index tid = omp_get_thread_num();
    Index threads = omp_get_num_threads();

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
    std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
    Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
    Scalar* blockB = (Scalar*)info[tid].blockB;

    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
    // (==GEMM_VAR1)
    for(Index k=0; k<depth; k+=kc)
    {
      const Index actual_kc = std::min(k+kc,depth)-k; // => rows of B', and cols of the A'

      // In order to reduce the chance that a thread has to wait for the other,
      // let's start by packing A'.
      pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);

      // Pack B_k to B' in a parallel fashion:
      // each thread packs the sub block B_k,j to B'_j where j is the thread id.

      // However, before copying to B'_j, we have to make sure that no other thread is still using it,
      // i.e., we test that info[tid].users equals 0.
      // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
      while(info[tid].users!=0) {}
      info[tid].users += threads;

      pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);

      // Notify the other threads that the part B'_j is ready to go.
      info[tid].sync = k;

      // Computes C_i += A' * B' per B'_j
      for(Index shift=0; shift<threads; ++shift)
      {
        Index j = (tid+shift)%threads;

        // At this point we have to make sure that B'_j has been updated by the thread j,
        // so we spin on its sync flag. However, there is no need to wait for the part
        // which has just been packed by the current thread!
        if(shift>0)
          while(info[j].sync!=k) {}

        gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w);
      }

      // Then keep going as usual with the remaining A'
      for(Index i=mc; i<rows; i+=mc)
      {
        const Index actual_mc = std::min(i+mc,rows)-i;

        // pack A_i,k to A'
        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);

        // C_i += A' * B'
        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w);
      }

      // Release all the sub blocks B'_j of B' for the current thread,
      // i.e., we simply decrement the number of users by 1
      for(Index j=0; j<threads; ++j)
        #pragma omp atomic
        --(info[j].users);
    }

    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
    ei_aligned_stack_delete(Scalar, w, sizeW);
  }
  else
#endif // EIGEN_HAS_OPENMP
  {
    // this is the sequential version!
    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
    std::size_t sizeB = kc*cols;
    std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
    Scalar* blockB = ei_aligned_stack_new(Scalar, sizeB);
    Scalar* blockW = ei_aligned_stack_new(Scalar, sizeW);

    // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
    // (==GEMM_VAR1)
    for(Index k2=0; k2<depth; k2+=kc)
    {
      const Index actual_kc = std::min(k2+kc,depth)-k2;

      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
      // Note that this panel will be read as many times as the number of blocks in the lhs's
      // vertical panel which is, in practice, a very low number.
      pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);

      // For each mc x kc block of the lhs's vertical panel...
      // (==GEPP_VAR1)
      for(Index i2=0; i2<rows; i2+=mc)
      {
        const Index actual_mc = std::min(i2+mc,rows)-i2;

        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
        // Note that this block will be read a very high number of times, which is equal to the number of
        // micro vertical panels of the large rhs's panel (e.g., cols/4 times).
        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);

        // Everything is packed, we can now call the block * panel kernel:
        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1, -1, 0, 0, blockW);
      }
    }

    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
    ei_aligned_stack_delete(Scalar, blockB, sizeB);
    ei_aligned_stack_delete(Scalar, blockW, sizeW);
  }
}
};
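
// For intuition: the kernel above computes C += alpha*A*B by slicing the depth (k)
// dimension into panels of kc and the row (m) dimension into blocks of mc, so that the
// packed rhs panel B' can stay in the L2 cache while the packed lhs block A' stays in
// the L1 cache for the duration of one gebp call. Stripped of the packing, the
// vectorization, and the register blocking, the loop nest is equivalent to this naive
// sketch (for exposition only; all operands column-major):
//
//   #include <algorithm>
//
//   void gemm_blocked_sketch(double* C, const double* A, const double* B,
//                            int rows, int cols, int depth, double alpha,
//                            int kc, int mc)
//   {
//     for(int k2=0; k2<depth; k2+=kc)                 // horizontal panels of B (==GEMM_VAR1)
//       for(int i2=0; i2<rows; i2+=mc)                // mc x kc blocks of A (==GEPP_VAR1)
//         for(int j=0; j<cols; ++j)                   // block * panel product (gebp's job)
//           for(int k=k2; k<std::min(k2+kc,depth); ++k)
//             for(int i=i2; i<std::min(i2+mc,rows); ++i)
//               C[i+j*rows] += alpha * A[i+k*rows] * B[k+j*depth];
//   }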

/*********************************************************************************
*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
*  implementation of the high level wrapper to ei_general_matrix_matrix_product
**********************************************************************************/

template<typename Lhs, typename Rhs>
struct ei_traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
 : ei_traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
{};

template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest>
struct ei_gemm_functor
{
  typedef typename Rhs::Scalar BlockBScalar;

  ei_gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha)
    : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha)
  {}

  void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
  {
    if(cols==-1)
      cols = m_rhs.cols();

    Gemm::run(rows, cols, m_lhs.cols(),
              (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(),
              (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(),
              (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
              m_actualAlpha, info);
  }

  Index sharedBlockBSize() const
  {
    int maxKc, maxMc;
    getBlockingSizes<Scalar>(maxKc,maxMc);
    return std::min<Index>(maxKc,m_rhs.rows()) * m_rhs.cols();
  }

  protected:
    const Lhs& m_lhs;
    const Rhs& m_rhs;
    Dest& m_dest;
    Scalar m_actualAlpha;
};

template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemmProduct>
  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
{
  public:
    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)

    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
    {
      EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret),
        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
    }

    template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const
    {
      ei_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());

      const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
      const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                                 * RhsBlasTraits::extractScalarFactor(m_rhs);

      typedef ei_gemm_functor<
        Scalar, Index,
        ei_general_matrix_matrix_product<
          Scalar, Index,
          (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
          (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
          (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
        _ActualLhsType,
        _ActualRhsType,
        Dest> GemmFunctor;

      ei_parallelize_gemm<(Dest::MaxRowsAtCompileTime>32)>(GemmFunctor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
    }
};

#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
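
// Usage-wise, this header is reached through the ordinary expression syntax: a product
// of two "large" dynamic-size matrices with matching scalar types is dispatched to the
// GemmProduct specialization above. A minimal sketch (whether and how it parallelizes
// depends on OpenMP configuration; the 2.5f factor simply exercises the alpha path):
//
//   #include <Eigen/Core>
//   using namespace Eigen;
//
//   int main()
//   {
//     MatrixXf A = MatrixXf::Random(512,512), B = MatrixXf::Random(512,512);
//     MatrixXf C = MatrixXf::Zero(512,512);
//     C.noalias() += 2.5f * A * B; // evaluated by the blocked GEMM path, alpha = 2.5
//     return 0;
//   }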