diff options
author | Gael Guennebaud <g.gael@free.fr> | 2009-07-27 10:27:01 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2009-07-27 10:27:01 +0200 |
commit | 6aba84719d09cf19a43a0e8356b010a0f37f2a5d (patch) | |
tree | 343d22e941872c404488d2d2ef5374db9bda0244 /Eigen/src/Core | |
parent | 1d4d9a37fd2b60d13f1bd520ab2ebebf18882b4e (diff) |
trmm is now working in all storage order configurations
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 58 | ||||
-rw-r--r-- | Eigen/src/Core/products/TriangularSolverMatrix.h | 47 | ||||
-rw-r--r-- | Eigen/src/Core/util/BlasUtil.h | 2 |
3 files changed, 36 insertions, 71 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index a1be8ea50..66fee793c 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -125,7 +125,7 @@ struct ei_gebp_kernel // loops on each register blocking of lhs/res for(int i=0; i<peeled_mc; i+=mr) { - const Scalar* blA = &blockA[i*strideA]; + const Scalar* blA = &blockA[i*strideA+offsetA*mr]; #ifdef EIGEN_VECTORIZE_SSE _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); #endif @@ -248,7 +248,7 @@ struct ei_gebp_kernel } for(int i=peeled_mc; i<rows; i++) { - const Scalar* blA = &blockA[i*strideA]; + const Scalar* blA = &blockA[i*strideA+offsetA]; #ifdef EIGEN_VECTORIZE_SSE _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); #endif @@ -285,7 +285,7 @@ struct ei_gebp_kernel { for(int i=0; i<peeled_mc; i+=mr) { - const Scalar* blA = &blockA[i*strideA]; + const Scalar* blA = &blockA[i*strideA+offsetA*mr]; #ifdef EIGEN_VECTORIZE_SSE _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); #endif @@ -317,7 +317,7 @@ struct ei_gebp_kernel } for(int i=peeled_mc; i<rows; i++) { - const Scalar* blA = &blockA[i*strideA]; + const Scalar* blA = &blockA[i*strideA+offsetA]; #ifdef EIGEN_VECTORIZE_SSE _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); #endif @@ -375,17 +375,21 @@ struct ei_gemm_pack_lhs // 4 5 6 7 16 17 18 19 25 28 // 8 9 10 11 20 21 22 23 26 29 // . . . . . . . . . . -template<typename Scalar, int nr> -struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> +template<typename Scalar, int nr, bool PanelMode> +struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode> { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols) + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, + int stride=0, int offset=0) { + ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); bool hasAlpha = alpha != Scalar(1); int packet_cols = (cols/nr) * nr; int count = 0; for(int j2=0; j2<packet_cols; j2+=nr) { + // skip what we have before + if(PanelMode) count += PacketSize * nr * offset; const Scalar* b0 = &rhs[(j2+0)*rhsStride]; const Scalar* b1 = &rhs[(j2+1)*rhsStride]; const Scalar* b2 = &rhs[(j2+2)*rhsStride]; @@ -418,10 +422,13 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> count += nr*PacketSize; } } + // skip what we have after + if(PanelMode) count += PacketSize * nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) for(int j2=packet_cols; j2<cols; ++j2) { + if(PanelMode) count += PacketSize * offset; const Scalar* b0 = &rhs[(j2+0)*rhsStride]; if (hasAlpha) { @@ -439,34 +446,36 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> count += PacketSize; } } + if(PanelMode) count += PacketSize * (stride-offset-depth); } } }; // this version is optimized for row major matrices -template<typename Scalar, int nr> -struct ei_gemm_pack_rhs<Scalar, nr, RowMajor> +template<typename Scalar, int nr, bool PanelMode> +struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode> { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols) + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, + int stride=0, int offset=0) { + ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); bool hasAlpha = alpha != Scalar(1); int packet_cols = (cols/nr) * nr; int count = 0; for(int j2=0; j2<packet_cols; j2+=nr) { + // skip what we have before + if(PanelMode) count += PacketSize * nr * offset; if (hasAlpha) { for(int k=0; k<depth; k++) { const Scalar* b0 = &rhs[k*rhsStride + j2]; - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1])); - if (nr==4) - { - ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2])); - ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3])); - } + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1])); + if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2])); + if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3])); count += nr*PacketSize; } } @@ -475,26 +484,27 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor> for(int k=0; k<depth; k++) { const Scalar* b0 = &rhs[k*rhsStride + j2]; - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1])); - if (nr==4) - { - ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2])); - ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3])); - } + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1])); + if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2])); + if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3])); count += nr*PacketSize; } } + // skip what we have after + if(PanelMode) count += PacketSize * nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) for(int j2=packet_cols; j2<cols; ++j2) { + if(PanelMode) count += PacketSize * offset; const Scalar* b0 = &rhs[j2]; for(int k=0; k<depth; k++) { ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride])); count += PacketSize; } + if(PanelMode) count += PacketSize * (stride-offset-depth); } } }; diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 550076f68..e28aba747 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -25,9 +25,6 @@ #ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H #define EIGEN_TRIANGULAR_SOLVER_MATRIX_H -template<typename Scalar, int nr> -struct ei_gemm_pack_rhs_panel; - // if the rhs is row major, we have to evaluate it in a temporary colmajor matrix template <typename Scalar, int LhsStorageOrder, bool ConjugateLhs, int Mode> struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,RowMajor,Mode> @@ -136,7 +133,7 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M int blockBOffset = IsLowerTriangular ? k1 : lengthTarget; // update the respective rows of B from rhs - ei_gemm_pack_rhs_panel<Scalar, Blocking::nr>() + ei_gemm_pack_rhs<Scalar, Blocking::nr, ColMajor, true>() (blockB, _rhs+startBlock, rhsStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset); // GEBP @@ -174,46 +171,4 @@ struct ei_triangular_solve_matrix<Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,M } }; -template<typename Scalar, int nr> -struct ei_gemm_pack_rhs_panel -{ - enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols, int stride, int offset) - { - int packet_cols = (cols/nr) * nr; - int count = 0; - for(int j2=0; j2<packet_cols; j2+=nr) - { - // skip what we have before - count += PacketSize * nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - for(int k=0; k<depth; k++) - { - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k])); - if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k])); - if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k])); - count += nr*PacketSize; - } - // skip what we have after - count += PacketSize * nr * (stride-offset-depth); - } - // copy the remaining columns one at a time (nr==1) - for(int j2=packet_cols; j2<cols; ++j2) - { - count += PacketSize * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - for(int k=0; k<depth; k++) - { - ei_pstore(&blockB[count], ei_pset1(alpha*b0[k])); - count += PacketSize; - } - count += PacketSize * (stride-offset-depth); - } - } -}; - #endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index c3a7289a8..339e11de7 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -32,7 +32,7 @@ template<typename Scalar, int mr, int nr, typename Conj> struct ei_gebp_kernel; -template<typename Scalar, int nr, int StorageOrder> +template<typename Scalar, int nr, int StorageOrder, bool PanelMode=false> struct ei_gemm_pack_rhs; template<typename Scalar, int mr, int StorageOrder, bool Conjugate = false> |