diff options
author | 2010-11-10 18:58:55 +0100 | |
---|---|---|
committer | 2010-11-10 18:58:55 +0100 | |
commit | 2577ef90c027d8bad4dd632022fa65cec6313159 (patch) | |
tree | 69b63c743bcc79daa4af6ab7df29c3dac6e1e034 /Eigen/src/Core/products/SelfadjointProduct.h | |
parent | c810d14d4d595f6a9e748f625ddb419bb91b8262 (diff) |
generalize our internal rank K update routine to support more general A*B product while evaluating only one triangular part and make it available via, e.g.:
R.triangularView<Lower>() += s * A * B;
Diffstat (limited to 'Eigen/src/Core/products/SelfadjointProduct.h')
-rw-r--r-- | Eigen/src/Core/products/SelfadjointProduct.h | 176 |
1 files changed, 7 insertions, 169 deletions
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 16320fa17..c6a5287b5 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -25,175 +25,12 @@ #ifndef EIGEN_SELFADJOINT_PRODUCT_H #define EIGEN_SELFADJOINT_PRODUCT_H -namespace internal { - /********************************************************************** * This file implements a self adjoint product: C += A A^T updating only * half of the selfadjoint matrix C. * It corresponds to the level 3 SYRK Blas routine. **********************************************************************/ -// forward declarations (defined at the end of this file) -template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> -struct sybb_kernel; - -/* Optimized selfadjoint product (_SYRK) */ -template <typename Scalar, typename Index, - int RhsStorageOrder, - int ResStorageOrder, bool AAT, int UpLo> -struct selfadjoint_product; - -// as usual if the result is row major => we transpose the product -template <typename Scalar, typename Index, int MatStorageOrder, bool AAT, int UpLo> -struct selfadjoint_product<Scalar, Index, MatStorageOrder, RowMajor, AAT, UpLo> -{ - static EIGEN_STRONG_INLINE void run(Index size, Index depth, const Scalar* mat, Index matStride, Scalar* res, Index resStride, Scalar alpha) - { - selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, !AAT, UpLo==Lower?Upper:Lower> - ::run(size, depth, mat, matStride, res, resStride, alpha); - } -}; - -template <typename Scalar, typename Index, - int MatStorageOrder, bool AAT, int UpLo> -struct selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpLo> -{ - - static EIGEN_DONT_INLINE void run( - Index size, Index depth, - const Scalar* _mat, Index matStride, - Scalar* res, Index resStride, - Scalar alpha) - { - const_blas_data_mapper<Scalar, Index, MatStorageOrder> mat(_mat,matStride); - -// if(AAT) -// alpha = conj(alpha); - - typedef gebp_traits<Scalar,Scalar> Traits; - - Index kc = depth; // cache block size along the K direction - Index mc = size; // cache block size along the M direction - Index nc = size; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); - // !!! mc must be a multiple of nr: - if(mc>Traits::nr) - mc = (mc/Traits::nr)*Traits::nr; - - Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*size; - Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + sizeW; - - // note that the actual rhs is the transpose/adjoint of mat - enum { - ConjLhs = NumTraits<Scalar>::IsComplex && !AAT, - ConjRhs = NumTraits<Scalar>::IsComplex && AAT - }; - - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs> gebp_kernel; - gemm_pack_rhs<Scalar, Index, Traits::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, MatStorageOrder, false> pack_lhs; - sybb_kernel<Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs, UpLo> sybb; - - for(Index k2=0; k2<depth; k2+=kc) - { - const Index actual_kc = std::min(k2+kc,depth)-k2; - - // note that the actual rhs is the transpose/adjoint of mat - pack_rhs(blockB, &mat(0,k2), matStride, actual_kc, size); - - for(Index i2=0; i2<size; i2+=mc) - { - const Index actual_mc = std::min(i2+mc,size)-i2; - - pack_lhs(blockA, &mat(i2, k2), matStride, actual_kc, actual_mc); - - // the selected actual_mc * size panel of res is split into three different part: - // 1 - before the diagonal => processed with gebp or skipped - // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel - // 3 - after the diagonal => processed with gebp or skipped - if (UpLo==Lower) - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), alpha, - -1, -1, 0, 0, allocatedBlockB); - - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); - - if (UpLo==Upper) - { - Index j2 = i2+actual_mc; - gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0), size-j2), alpha, - -1, -1, 0, 0, allocatedBlockB); - } - } - } - ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB); - } -}; - -// Optimized SYmmetric packed Block * packed Block product kernel. -// This kernel is built on top of the gebp kernel: -// - the current selfadjoint block (res) is processed per panel of actual_mc x BlockSize -// where BlockSize is set to the minimal value allowing gebp to be as fast as possible -// - then, as usual, each panel is split into three parts along the diagonal, -// the sub blocks above and below the diagonal are processed as usual, -// while the selfadjoint block overlapping the diagonal is evaluated into a -// small temporary buffer which is then accumulated into the result using a -// triangular traversal. -template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> -struct sybb_kernel -{ - enum { - PacketSize = packet_traits<Scalar>::size, - BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) - }; - void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar alpha, Scalar* workspace) - { - gebp_kernel<Scalar, Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; - Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer; - - // let's process the block per panel of actual_mc x BlockSize, - // again, each is split into three parts, etc. - for (Index j=0; j<size; j+=BlockSize) - { - Index actualBlockSize = std::min<Index>(BlockSize,size - j); - const Scalar* actual_b = blockB+j*depth; - - if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); - - // selfadjoint micro block - { - Index i = j; - buffer.setZero(); - // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); - // 2 - triangular accumulation - for(Index j1=0; j1<actualBlockSize; ++j1) - { - Scalar* r = res + (j+j1)*resStride + i; - for(Index i1=UpLo==Lower ? j1 : 0; - UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1) - r[i1] += buffer(i1,j1); - } - } - - if(UpLo==Lower) - { - Index i = j+actualBlockSize; - gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); - } - } - } -}; - -} // end namespace internal - // high level API template<typename MatrixType, unsigned int UpLo> @@ -209,12 +46,13 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()); enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 }; - - internal::selfadjoint_product<Scalar, Index, - _ActualUType::Flags&RowMajorBit ? RowMajor : ColMajor, - MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, - !UBlasTraits::NeedToConjugate, UpLo> - ::run(_expression().cols(), actualU.cols(), &actualU.coeff(0,0), actualU.outerStride(), + + internal::general_matrix_matrix_triangular_product<Index, + Scalar, _ActualUType::Flags&RowMajorBit ? RowMajor : ColMajor, UBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, + Scalar, _ActualUType::Flags&RowMajorBit ? ColMajor : RowMajor, (!UBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, + MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> + ::run(_expression().cols(), actualU.cols(), + &actualU.coeff(0,0), actualU.outerStride(), &actualU.coeff(0,0), actualU.outerStride(), const_cast<Scalar*>(_expression().data()), _expression().outerStride(), actualAlpha); return *this; |