diff options
author | Gael Guennebaud <g.gael@free.fr> | 2009-07-24 10:08:21 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2009-07-24 10:08:21 +0200 |
commit | 6076173f0b941fbb4cd22ff6e0d1cb9f0b56452f (patch) | |
tree | 19fb0982da8e3ca86ab5d5ce7c28c5347149b3a9 | |
parent | 82c5438c95a587708483d3bb0933b9166d7bb303 (diff) |
add a simplified version of the sybb kernel built on top of gebp
-rw-r--r-- | Eigen/src/Core/products/SelfadjointProduct.h | 55 |
1 files changed, 54 insertions, 1 deletions
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 08df9e15c..facde7252 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -150,11 +150,64 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> // optimized SYmmetric packed Block * packed Block product kernel +template<typename Scalar, int mr, int nr, typename Conj, int UpLo> +struct ei_sybb_kernel +{ + enum { + PacketSize = ei_packet_traits<Scalar>::size, + BlockSize = EIGEN_ENUM_MAX(mr,nr) + }; + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols) + { + ei_gebp_kernel<Scalar, mr, nr, Conj> gebp_kernel; + Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer; + + // let's process the block per panel of actual_mc x BlockSize, + // again, each is split into three parts, etc. + for (int j=0; j<actual_mc; j+=BlockSize) + { + int actualBlockSize = std::min(BlockSize,actual_mc - j); + int actualPacketCols = std::min(actualBlockSize,std::max(packet_cols-j,0)); + const Scalar* actual_b = blockB+j*actual_kc*PacketSize; + + if(UpLo==UpperTriangular) + gebp_kernel(res+j*resStride, resStride, blockA, actual_b, + j, actual_kc, actualPacketCols, 0, actualBlockSize); + + // selfadjoint micro block + // => use the gebp kernel on a temporary buffer, + // and then perform a triangular copy + { + int i = j; + buffer.setZero(); + gebp_kernel(buffer.data(), BlockSize, blockA+actual_kc*i, actual_b, + actualBlockSize, actual_kc, actualPacketCols, 0, actualBlockSize); + for(int j1=0; j1<actualBlockSize; ++j1) + { + Scalar* r = res + (j+j1)*resStride; + for(int i1=UpLo==LowerTriangular ? j1 : 0; + UpLo==LowerTriangular ? i1<actualBlockSize : i1<=j1; ++j1) + r[i1+j1*resStride] = buffer(i1,j1); + } + } + + if(UpLo==LowerTriangular) + { + int i = j+actualBlockSize; + if(i<actual_mc) + gebp_kernel(res+j*resStride+i, resStride, blockA+actual_kc*i, actual_b, + actual_mc-i, actual_kc, actualPacketCols, 0, actualBlockSize); + } + } + } +}; + +// optimized SYmmetric packed Block * packed Block product kernel // this kernel is very similar to the gebp kernel: the only differences are // the piece of code to avoid the writes off the diagonal // => TODO find a way to factorize the two kernels in a single one template<typename Scalar, int mr, int nr, typename Conj, int UpLo> -struct ei_sybb_kernel +struct ei_sybb_kernel_ { void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols) { |