// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_SELFADJOINT_PRODUCT_H
#define EIGEN_SELFADJOINT_PRODUCT_H

namespace internal {

/**********************************************************************
* This file implements a self adjoint product: C += A A^T
* updating only half of the selfadjoint matrix C.
* It corresponds to the level 3 SYRK BLAS routine.
**********************************************************************/
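
// Usage sketch: the entry point to this kernel is the rankUpdate() member of
// SelfAdjointView, defined at the end of this file. A minimal example
// (assuming <Eigen/Dense> is included; the sizes are arbitrary):
//
//   Eigen::MatrixXf c = Eigen::MatrixXf::Zero(4,4);
//   Eigen::MatrixXf u = Eigen::MatrixXf::Random(4,2);
//   // lower triangle of c += 2.5 * u * u^T;
//   // the strictly upper part of c is neither read nor written
//   c.selfadjointView<Eigen::Lower>().rankUpdate(u, 2.5f);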

// forward declarations (defined at the end of this file)
template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
struct sybb_kernel;

/* Optimized selfadjoint product (_SYRK) */
template <typename Scalar, typename Index,
          int MatStorageOrder,
          int ResStorageOrder, bool AAT, int UpLo>
struct selfadjoint_product;

// as usual if the result is row major => we transpose the product
template <typename Scalar, typename Index, int MatStorageOrder, bool AAT, int UpLo>
struct selfadjoint_product<Scalar, Index, MatStorageOrder, RowMajor, AAT, UpLo>
{
  static EIGEN_STRONG_INLINE void run(Index size, Index depth, const Scalar* mat, Index matStride,
                                      Scalar* res, Index resStride, Scalar alpha)
  {
    selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, !AAT, UpLo==Lower?Upper:Lower>
      ::run(size, depth, mat, matStride, res, resStride, alpha);
  }
};

template <typename Scalar, typename Index, int MatStorageOrder, bool AAT, int UpLo>
struct selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpLo>
{
  static EIGEN_DONT_INLINE void run(
    Index size, Index depth,
    const Scalar* _mat, Index matStride,
    Scalar* res,        Index resStride,
    Scalar alpha)
  {
    const_blas_data_mapper<Scalar,Index,MatStorageOrder> mat(_mat,matStride);

//     if(AAT)
//       alpha = conj(alpha);

    typedef gebp_traits<Scalar,Scalar> Traits;

    Index kc = depth; // cache block size along the K direction
    Index mc = size;  // cache block size along the M direction
    Index nc = size;  // cache block size along the N direction
    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
    // !!! mc must be a multiple of nr:
    if(mc>Traits::nr)
      mc = (mc/Traits::nr)*Traits::nr;

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
    std::size_t sizeB = sizeW + kc*size;
    Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
    Scalar* blockB = allocatedBlockB + sizeW;

    // note that the actual rhs is the transpose/adjoint of mat
    enum {
      ConjLhs = NumTraits<Scalar>::IsComplex && !AAT,
      ConjRhs = NumTraits<Scalar>::IsComplex && AAT
    };

    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs> gebp_kernel;
    gemm_pack_rhs<Scalar, Index, Traits::nr, MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, MatStorageOrder> pack_lhs;
    sybb_kernel<Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs, UpLo> sybb;

    for(Index k2=0; k2<depth; k2+=kc)
    {
      const Index actual_kc = std::min(k2+kc,depth)-k2;

      // pack the transpose/adjoint of mat as the rhs
      pack_rhs(blockB, &mat(0,k2), matStride, actual_kc, size);

      for(Index i2=0; i2<size; i2+=mc)
      {
        const Index actual_mc = std::min(i2+mc,size)-i2;

        pack_lhs(blockA, &mat(i2, k2), matStride, actual_kc, actual_mc);

        // the selected actual_mc * size panel of res is split into three different parts:
        //  1 - before the diagonal => processed with gebp or skipped
        //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
        //  3 - after the diagonal => processed with gebp or skipped
        if (UpLo==Lower)
          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2),
                      alpha, -1, -1, 0, 0, allocatedBlockB);

        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);

        if (UpLo==Upper)
        {
          Index j2 = i2+actual_mc;
          gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0), size-j2),
                      alpha, -1, -1, 0, 0, allocatedBlockB);
        }
      }
    }

    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
    ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
  }
};

// Optimized SYmmetric packed Block * packed Block product kernel.
// This kernel is built on top of the gebp kernel:
//  - the current selfadjoint block (res) is processed per panel of actual_mc x BlockSize,
//    where BlockSize is set to the minimal value allowing gebp to be as fast as possible
//  - then, as usual, each panel is split into three parts along the diagonal:
//    the sub blocks above and below the diagonal are processed as usual,
//    while the selfadjoint block overlapping the diagonal is evaluated into a
//    small temporary buffer which is then accumulated into the result using a
//    triangular traversal.
template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
struct sybb_kernel
{
  enum {
    PacketSize = packet_traits<Scalar>::size,
    BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
  };
  void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB,
                  Index size, Index depth, Scalar alpha, Scalar* workspace)
  {
    gebp_kernel<Scalar, Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
    Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer;

    // let's process the block per panel of actual_mc x BlockSize;
    // again, each panel is split into three parts along the diagonal
    for (Index j=0; j<size; j+=BlockSize)
    {
      Index actualBlockSize = std::min<Index>(BlockSize,size - j);
      const Scalar* actual_b = blockB+j*depth;

      if(UpLo==Upper)
        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize,
                    alpha, -1, -1, 0, 0, workspace);

      // selfadjoint micro block
      {
        Index i = j;
        buffer.setZero();
        // 1 - apply the kernel on the temporary buffer
        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize,
                    alpha, -1, -1, 0, 0, workspace);
        // 2 - triangular accumulation
        for(Index j1=0; j1<actualBlockSize; ++j1)
        {
          Scalar* r = res + (j+j1)*resStride + i;
          for(Index i1=UpLo==Lower ? j1:0;
              UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
            r[i1] += buffer(i1,j1);
        }
      }

      if(UpLo==Lower)
      {
        Index i = j+actualBlockSize;
        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize,
                    alpha, -1, -1, 0, 0, workspace);
      }
    }
  }
};
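
// For reference, a minimal sketch of what the selfadjoint micro block step
// above computes, written with the public API (illustration only, the helper
// name is hypothetical and nothing here is used by the kernel): the diagonal
// block is evaluated as a plain small product into a temporary, then only the
// selected triangle is folded back into the result:
//
//   void diag_block_update_lower(Eigen::MatrixXf& res,
//                                const Eigen::MatrixXf& a, const Eigen::MatrixXf& b)
//   {
//     Eigen::MatrixXf buffer = a * b;                // full small product
//     res.triangularView<Eigen::Lower>() += buffer;  // fold lower triangle only
//   }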

} // end namespace internal

// high level API

template<typename MatrixType, unsigned int UpLo>
template<typename DerivedU>
SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
::rankUpdate(const MatrixBase<DerivedU>& u, Scalar alpha)
{
  typedef internal::blas_traits<DerivedU> UBlasTraits;
  typedef typename UBlasTraits::DirectLinearAccessType ActualUType;
  typedef typename internal::cleantype<ActualUType>::type _ActualUType;
  const ActualUType actualU = UBlasTraits::extract(u.derived());

  Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived());

  enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };

  internal::selfadjoint_product<Scalar, Index,
    _ActualUType::Flags&RowMajorBit ? RowMajor : ColMajor,
    IsRowMajor ? RowMajor : ColMajor,
    !UBlasTraits::NeedToConjugate, UpLo>
    ::run(_expression().cols(), actualU.cols(), &actualU.coeff(0,0), actualU.outerStride(),
          const_cast<Scalar*>(_expression().data()), _expression().outerStride(), actualAlpha);

  return *this;
}

#endif // EIGEN_SELFADJOINT_PRODUCT_H
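
// Usage note for complex scalars (illustration only): rankUpdate() performs
// c += alpha * u * u.adjoint(), the conjugation being carried by one of the
// two packed sides as selected by the ConjLhs/ConjRhs enums above. For example:
//
//   Eigen::MatrixXcf c = Eigen::MatrixXcf::Zero(3,3);
//   Eigen::MatrixXcf u = Eigen::MatrixXcf::Random(3,5);
//   // upper triangle of c += u * u.adjoint(); alpha should be real
//   // for the result to remain self-adjoint
//   c.selfadjointView<Eigen::Upper>().rankUpdate(u, 1.0f);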