generalize our internal rank K update routine to support more general A*B product while evaluating only one triangular part and make it available via, e.g.:

R.triangularView<Lower>() += s * A * B;
author: Gael Guennebaud <g.gael@free.fr> 2010-11-10 18:58:55 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2010-11-10 18:58:55 +0100
commit: 2577ef90c027d8bad4dd632022fa65cec6313159 (patch)
tree: 69b63c743bcc79daa4af6ab7df29c3dac6e1e034 /Eigen/src/Core/products/SelfadjointProduct.h
parent: c810d14d4d595f6a9e748f625ddb419bb91b8262 (diff)
1 files changed, 7 insertions, 169 deletions
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index 16320fa17..c6a5287b5 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -25,175 +25,12 @@
 #ifndef EIGEN_SELFADJOINT_PRODUCT_H
 #define EIGEN_SELFADJOINT_PRODUCT_H
 
-namespace internal {
-
 /**********************************************************************
 * This file implements a self adjoint product: C += A A^T updating only
 * half of the selfadjoint matrix C.
 * It corresponds to the level 3 SYRK Blas routine.
 **********************************************************************/
 
-// forward declarations (defined at the end of this file)
-template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
-struct sybb_kernel;
-
-/* Optimized selfadjoint product (_SYRK) */
-template <typename Scalar, typename Index,
-          int RhsStorageOrder,
-          int ResStorageOrder, bool AAT, int UpLo>
-struct selfadjoint_product;
-
-// as usual if the result is row major => we transpose the product
-template <typename Scalar, typename Index, int MatStorageOrder, bool AAT, int UpLo>
-struct selfadjoint_product<Scalar, Index, MatStorageOrder, RowMajor, AAT, UpLo>
-{
-  static EIGEN_STRONG_INLINE void run(Index size, Index depth, const Scalar* mat, Index matStride, Scalar* res, Index resStride, Scalar alpha)
-  {
-    selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, !AAT, UpLo==Lower?Upper:Lower>
-      ::run(size, depth, mat, matStride, res, resStride, alpha);
-  }
-};
-
-template <typename Scalar, typename Index,
-          int MatStorageOrder, bool AAT, int UpLo>
-struct selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpLo>
-{
-
-  static EIGEN_DONT_INLINE void run(
-    Index size, Index depth,
-    const Scalar* _mat, Index matStride,
-    Scalar* res,        Index resStride,
-    Scalar alpha)
-  {
-    const_blas_data_mapper<Scalar, Index, MatStorageOrder> mat(_mat,matStride);
-
-//     if(AAT)
-//       alpha = conj(alpha);
-
-    typedef gebp_traits<Scalar,Scalar> Traits;
-
-    Index kc = depth; // cache block size along the K direction
-    Index mc = size;  // cache block size along the M direction
-    Index nc = size;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    // !!! mc must be a multiple of nr:
-    if(mc>Traits::nr)
-      mc = (mc/Traits::nr)*Traits::nr;
-
-    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*size;
-    Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
-    Scalar* blockB = allocatedBlockB + sizeW;
-
-    // note that the actual rhs is the transpose/adjoint of mat
-    enum {
-      ConjLhs = NumTraits<Scalar>::IsComplex && !AAT,
-      ConjRhs = NumTraits<Scalar>::IsComplex && AAT
-    };
-
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs> gebp_kernel;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, MatStorageOrder, false> pack_lhs;
-    sybb_kernel<Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs, UpLo> sybb;
-
-    for(Index k2=0; k2<depth; k2+=kc)
-    {
-      const Index actual_kc = std::min(k2+kc,depth)-k2;
-
-      // note that the actual rhs is the transpose/adjoint of mat
-      pack_rhs(blockB, &mat(0,k2), matStride, actual_kc, size);
-
-      for(Index i2=0; i2<size; i2+=mc)
-      {
-        const Index actual_mc = std::min(i2+mc,size)-i2;
-
-        pack_lhs(blockA, &mat(i2, k2), matStride, actual_kc, actual_mc);
-
-        // the selected actual_mc * size panel of res is split into three different part:
-        //  1 - before the diagonal => processed with gebp or skipped
-        //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
-        //  3 - after the diagonal => processed with gebp or skipped
-        if (UpLo==Lower)
-          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), alpha,
-                      -1, -1, 0, 0, allocatedBlockB);
-
-        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
-
-        if (UpLo==Upper)
-        {
-          Index j2 = i2+actual_mc;
-          gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0), size-j2), alpha,
-                      -1, -1, 0, 0, allocatedBlockB);
-        }
-      }
-    }
-    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-    ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
-  }
-};
-
-// Optimized SYmmetric packed Block * packed Block product kernel.
-// This kernel is built on top of the gebp kernel:
-// - the current selfadjoint block (res) is processed per panel of actual_mc x BlockSize
-//   where BlockSize is set to the minimal value allowing gebp to be as fast as possible
-// - then, as usual, each panel is split into three parts along the diagonal,
-//   the sub blocks above and below the diagonal are processed as usual,
-//   while the selfadjoint block overlapping the diagonal is evaluated into a
-//   small temporary buffer which is then accumulated into the result using a
-//   triangular traversal.
-template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
-struct sybb_kernel
-{
-  enum {
-    PacketSize = packet_traits<Scalar>::size,
-    BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
-  };
-  void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar alpha, Scalar* workspace)
-  {
-    gebp_kernel<Scalar, Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
-    Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer;
-
-    // let's process the block per panel of actual_mc x BlockSize,
-    // again, each is split into three parts, etc.
-    for (Index j=0; j<size; j+=BlockSize)
-    {
-      Index actualBlockSize = std::min<Index>(BlockSize,size - j);
-      const Scalar* actual_b = blockB+j*depth;
-
-      if(UpLo==Upper)
-        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
-
-      // selfadjoint micro block
-      {
-        Index i = j;
-        buffer.setZero();
-        // 1 - apply the kernel on the temporary buffer
-        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
-        // 2 - triangular accumulation
-        for(Index j1=0; j1<actualBlockSize; ++j1)
-        {
-          Scalar* r = res + (j+j1)*resStride + i;
-          for(Index i1=UpLo==Lower ? j1 : 0;
-              UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
-            r[i1] += buffer(i1,j1);
-        }
-      }
-
-      if(UpLo==Lower)
-      {
-        Index i = j+actualBlockSize;
-        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
-      }
-    }
-  }
-};
-
-} // end namespace internal
-
 // high level API
 
 template<typename MatrixType, unsigned int UpLo>
@@ -209,12 +46,13 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
   Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived());
 
   enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
-
-  internal::selfadjoint_product<Scalar, Index,
-    _ActualUType::Flags&RowMajorBit ? RowMajor : ColMajor,
-    MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor,
-    !UBlasTraits::NeedToConjugate, UpLo>
-    ::run(_expression().cols(), actualU.cols(), &actualU.coeff(0,0), actualU.outerStride(),
+  
+  internal::general_matrix_matrix_triangular_product<Index,
+    Scalar, _ActualUType::Flags&RowMajorBit ? RowMajor : ColMajor,   UBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
+    Scalar, _ActualUType::Flags&RowMajorBit ? ColMajor : RowMajor, (!UBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+    MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
+    ::run(_expression().cols(), actualU.cols(),
+          &actualU.coeff(0,0), actualU.outerStride(), &actualU.coeff(0,0), actualU.outerStride(),
           const_cast<Scalar*>(_expression().data()), _expression().outerStride(), actualAlpha);
 
   return *this;
author	Gael Guennebaud <g.gael@free.fr>	2010-11-10 18:58:55 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2010-11-10 18:58:55 +0100
commit	2577ef90c027d8bad4dd632022fa65cec6313159 (patch)
tree	69b63c743bcc79daa4af6ab7df29c3dac6e1e034 /Eigen/src/Core/products/SelfadjointProduct.h
parent	c810d14d4d595f6a9e748f625ddb419bb91b8262 (diff)