diff options
-rw-r--r-- | Eigen/src/Core/products/TriangularMatrixMatrix.h | 68 | ||||
-rw-r--r-- | Eigen/src/Core/products/TriangularMatrixVector_MKL.h | 8 | ||||
-rw-r--r-- | blas/level3_impl.h | 12 | ||||
-rw-r--r-- | test/nomalloc.cpp | 2 |
4 files changed, 54 insertions, 36 deletions
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 06053bfd9..c41d997d8 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -76,7 +76,7 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, - Scalar alpha) + Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { product_triangular_matrix_matrix<Scalar, Index, (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper), @@ -86,7 +86,7 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, ColMajor> - ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; @@ -111,7 +111,7 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, - Scalar alpha) + Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros Index diagSize = (std::min)(_rows,_depth); @@ -122,15 +122,16 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - Index kc = depth; // cache block size along the K direction - Index mc = rows; // cache block size along the M direction - Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; + + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; triangularBuffer.setZero(); @@ -188,7 +189,7 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, - actualPanelWidth, actual_kc, 0, blockBOffset); + actualPanelWidth, actual_kc, 0, blockBOffset, blockW); // GEBP with remaining micro panel if (lengthTarget>0) @@ -198,7 +199,7 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, - actualPanelWidth, actual_kc, 0, blockBOffset); + actualPanelWidth, actual_kc, 0, blockBOffset, blockW); } } } @@ -212,7 +213,7 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); } } } @@ -239,7 +240,7 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, - Scalar alpha) + Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros Index diagSize = (std::min)(_cols,_depth); @@ -250,16 +251,16 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false, const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - Index kc = depth; // cache block size along the K direction - Index mc = rows; // cache block size along the M direction - Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols; std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; + + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; triangularBuffer.setZero(); @@ -347,13 +348,13 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false, alpha, actual_kc, actual_kc, // strides blockOffset, blockOffset,// offsets - allocatedBlockB); // workspace + blockW); // workspace } } gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, blockA, geb, actual_mc, actual_kc, rs, alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0, blockW); } } } @@ -386,17 +387,28 @@ struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, + Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; + + enum { IsLower = (Mode&Lower) == Lower }; + Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols()); + Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows()); + Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) + : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); + + BlockingType blocking(stripedRows, stripedCols, stripedDepth); + internal::product_triangular_matrix_matrix<Scalar, Index, Mode, LhsIsTriangular, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor> ::run( - lhs.rows(), rhs.cols(), lhs.cols(),// LhsIsTriangular ? rhs.cols() : lhs.rows(), // sizes + stripedRows, stripedCols, stripedDepth, // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha, blocking ); } }; diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h index 3c2c3049a..3589b8c5e 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h @@ -82,11 +82,11 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, LowUp = IsLower ? Lower : Upper \ }; \ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ + const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \ { \ if (ConjLhs || IsZeroDiag) { \ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha, blocking); \ return; \ }\ Index size = (std::min)(_rows,_cols); \ @@ -167,11 +167,11 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, LowUp = IsLower ? Lower : Upper \ }; \ static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \ - const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \ + const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \ { \ if (IsZeroDiag) { \ triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha, blocking); \ return; \ }\ Index size = (std::min)(_rows,_cols); \ diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 4f4f39080..0a6dbc383 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -167,7 +167,7 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb) { // std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n"; - typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar); + typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar, internal::level3_blocking<Scalar,Scalar>&); static functype func[32]; static bool init = false; if(!init) @@ -236,9 +236,15 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, matrix(b,*m,*n,*ldb).setZero(); if(SIDE(*side)==LEFT) - func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha); + { + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m); + func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); + } else - func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha); + { + internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n); + func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); + } return 1; } diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index 1feeff4bc..f7c2ed229 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -165,7 +165,7 @@ void ctms_decompositions() X = hQR.solve(B); x = hQR.solve(b); Eigen::ColPivHouseholderQR<Matrix> cpQR; cpQR.compute(A); - // FIXME X = cpQR.solve(B); + X = cpQR.solve(B); x = cpQR.solve(b); Eigen::FullPivHouseholderQR<Matrix> fpQR; fpQR.compute(A); // FIXME X = fpQR.solve(B); |