From a81388fae9a4101060206d29019d031808ec83d7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 23 Jul 2009 19:01:20 +0200 Subject: Implement efficient sefladjoint product (aka SYRK) : C += alpha * U U^T It is currently available via SelfAdjointView::rankKupdate. TODO: allows to write SelfAdjointView += u * u.adjoint() --- Eigen/src/Core/SelfAdjointView.h | 10 + Eigen/src/Core/products/GeneralMatrixMatrix.h | 18 ++ Eigen/src/Core/products/SelfadjointProduct.h | 419 ++++++++++++++++++++++++++ Eigen/src/Core/util/BlasUtil.h | 4 +- 4 files changed, 449 insertions(+), 2 deletions(-) create mode 100644 Eigen/src/Core/products/SelfadjointProduct.h (limited to 'Eigen/src/Core') diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 540f4fe93..b1b4f9e32 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -128,6 +128,16 @@ template class SelfAdjointView template void rank2update(const MatrixBase& u, const MatrixBase& v, Scalar alpha = Scalar(1)); + /** Perform a symmetric rank K update of the selfadjoint matrix \c *this: + * \f$ this = this + \alpha ( u u^* ) \f$ + * where \a u is a vector or matrix. + * + * Note that to perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply + * call this function with u.adjoint(). + */ + template + void rankKupdate(const MatrixBase& u, Scalar alpha = Scalar(1)); + /////////// Cholesky module /////////// const LLT llt() const; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index b2f51ca5b..9166921fe 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -334,6 +334,19 @@ struct ei_gebp_kernel }; // pack a block of the lhs +// The travesal is as follow (mr==4): +// 0 4 8 12 ... +// 1 5 9 13 ... +// 2 6 10 14 ... +// 3 7 11 15 ... +// +// 16 20 24 28 ... +// 17 21 25 29 ... +// 18 22 26 30 ... +// 19 23 27 31 ... +// +// 32 33 34 35 ... +// 36 36 38 39 ... template struct ei_gemm_pack_lhs { @@ -357,6 +370,11 @@ struct ei_gemm_pack_lhs // copy a complete panel of the rhs while expending each coefficient into a packet form // this version is optimized for column major matrices +// The traversal order is as follow (nr==4): +// 0 1 2 3 12 13 14 15 24 27 +// 4 5 6 7 16 17 18 19 25 28 +// 8 9 10 11 20 21 22 23 26 29 +// . . . . . . . . . . template struct ei_gemm_pack_rhs { diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h new file mode 100644 index 000000000..5d045a921 --- /dev/null +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -0,0 +1,419 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_SELFADJOINT_PRODUCT_H +#define EIGEN_SELFADJOINT_PRODUCT_H + +/********************************************************************** +* This file implement a self adjoint product: C += A A^T updating only +* an half of the selfadjoint matrix C. +* It corresponds to the level 3 SYRK Blas routine. +**********************************************************************/ + +// forward declarations (defined at the end of this file) +template +struct ei_sybb_kernel; + +/* Optimized selfadjoint product (_SYRK) */ +template +struct ei_selfadjoint_product; + +// as usual if the result is row major => we transpose the product +template +struct ei_selfadjoint_product +{ + static EIGEN_STRONG_INLINE void run(int size, const Scalar* mat, int matStride, Scalar* res, int resStride, Scalar alpha) + { + ei_selfadjoint_product + ::run(size, mat, matStride, res, resStride, alpha); + } +}; + +template +struct ei_selfadjoint_product +{ + + static EIGEN_DONT_INLINE void run( + int size, + const Scalar* _mat, int matStride, + Scalar* res, int resStride, + Scalar alpha) + { + ei_const_blas_data_mapper mat(_mat,matStride); + + if(AAT) + alpha = ei_conj(alpha); + + typedef ei_product_blocking_traits Blocking; + + int kc = std::min(Blocking::Max_kc,size); // cache block size along the K direction + int mc = std::min(Blocking::Max_mc,size); // cache block size along the M direction + + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize); + + // number of columns which can be processed by packet of nr columns + int packet_cols = (size/Blocking::nr)*Blocking::nr; + + // note that the actual rhs is the transpose/adjoint of mat + typedef ei_conj_helper::IsComplex && !AAT, NumTraits::IsComplex && AAT> Conj; + + ei_gebp_kernel gebp_kernel; + + for(int k2=0; k2() + (blockB, &mat(0,k2), matStride, alpha, actual_kc, packet_cols, size); + + for(int i2=0; i2() + (blockA, &mat(i2, k2), matStride, actual_kc, actual_mc); + + // the selected actual_mc * size panel of res is split into three different part: + // 1 - before the diagonal => processed with gebp or skipped + // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel + // 3 - after the diagonal => processed with gebp or skipped + if (UpLo==LowerTriangular) + gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, std::min(packet_cols,i2), i2, std::min(size,i2)); + + ei_sybb_kernel() + (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc, std::min(actual_mc,std::max(packet_cols-i2,0))); + + if (UpLo==UpperTriangular) + { + int j2 = i2+actual_mc; + gebp_kernel(res+resStride*j2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, + std::max(0,packet_cols-j2), i2, std::max(0,size-j2)); + } + } + } + ei_aligned_stack_delete(Scalar, blockA, kc*mc); + ei_aligned_stack_delete(Scalar, blockB, kc*size*Blocking::PacketSize); + } +}; + +// high level API + +template +template +void SelfAdjointView +::rankKupdate(const MatrixBase& u, Scalar alpha) +{ + typedef ei_blas_traits UBlasTraits; + typedef typename UBlasTraits::DirectLinearAccessType ActualUType; + typedef typename ei_cleantype::type _ActualUType; + const ActualUType actualU = UBlasTraits::extract(u.derived()); + + Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()); + + enum { IsRowMajor = (ei_traits::Flags&RowMajorBit)?1:0 }; + + ei_selfadjoint_product::Flags&RowMajorBit ? RowMajor : ColMajor, + !UBlasTraits::NeedToConjugate, + UpLo>::run(_expression().cols(), &actualU.coeff(0,0), actualU.stride(), const_cast(_expression().data()), _expression().stride(), actualAlpha); +} + + + +// optimized SYmmetric packed Block * packed Block product kernel +// this kernel is very similar to the gebp kernel: the only differences are +// the piece of code to avoid the writes off the diagonal +// => TODO find a way to factorize the two kernels in a single one +template +struct ei_sybb_kernel +{ + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols) + { + typedef typename ei_packet_traits::type PacketType; + enum { PacketSize = ei_packet_traits::size }; + Conj cj; + const int peeled_mc = (actual_mc/mr)*mr; + // loops on each cache friendly block of the result/rhs + for(int j2=0; j2=j2+nr : i+mr<=j2) + { + ei_pstoreu(&res[(j2+0)*resStride + i], C0); + ei_pstoreu(&res[(j2+1)*resStride + i], C1); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); + ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); + ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7); + } + else + { + Scalar buf[mr*nr]; + // overlap => copy to a temporary mr x nr buffer and then triangular copy + ei_pstore(&buf[0*mr], C0); + ei_pstore(&buf[1*mr], C1); + if(nr==4) ei_pstore(&buf[2*mr], C2); + if(nr==4) ei_pstore(&buf[3*mr], C3); + ei_pstore(&buf[0*mr + PacketSize], C4); + ei_pstore(&buf[1*mr + PacketSize], C5); + if(nr==4) ei_pstore(&buf[2*mr + PacketSize], C6); + if(nr==4) ei_pstore(&buf[3*mr + PacketSize], C7); + + for(int j1=0; j1= j2+j1 : i+i1 <= j2+j1) + res[(j2+j1)*resStride + i+i1] = buf[i1 + j1 * mr]; + } + } + } + for(int i=std::max(start_i,peeled_mc); i=j2+nr : i+mr<=j2) { + res[(j2+0)*resStride + i] += C0; + res[(j2+1)*resStride + i] += C1; + if(nr==4) res[(j2+2)*resStride + i] += C2; + if(nr==4) res[(j2+3)*resStride + i] += C3; + } + else + { + if(UpLo==LowerTriangular ? i>=j2+0 : i<=j2+0) res[(j2+0)*resStride + i] += C0; + if(UpLo==LowerTriangular ? i>=j2+1 : i<=j2+1) res[(j2+1)*resStride + i] += C1; + if(nr==4) if(UpLo==LowerTriangular ? i>=j2+2 : i<=j2+2) res[(j2+2)*resStride + i] += C2; + if(nr==4) if(UpLo==LowerTriangular ? i>=j2+3 : i<=j2+3) res[(j2+3)*resStride + i] += C3; + } + } + } + + // process remaining rhs/res columns one at a time + // => do the same but with nr==1 + for(int j2=packet_cols; j2=j2 : i<=j2) ei_pstoreu(&res[(j2+0)*resStride + i], C0); + if(UpLo==LowerTriangular ? i+PacketSize>=j2 : i+PacketSize<=j2) ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); + } + if(UpLo==LowerTriangular) + start_i = j2; + for(int i=std::max(start_i,peeled_mc); i