diff options
Diffstat (limited to 'Eigen/src/Core/products')
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 272 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 12 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 64 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointProduct.h | 21 | ||||
-rw-r--r-- | Eigen/src/Core/products/TriangularMatrixMatrix.h | 22 | ||||
-rw-r--r-- | Eigen/src/Core/products/TriangularSolverMatrix.h | 11 |
6 files changed, 177 insertions, 225 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e1c258ff0..9c9dd52f2 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -136,7 +136,7 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st // FIXME #ifndef EIGEN_HAS_FUSE_CJMADD #define EIGEN_HAS_FUSE_CJMADD -#endif +#endif #ifdef EIGEN_HAS_FUSE_CJMADD #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else @@ -144,7 +144,7 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st #endif /* optimized GEneral packed Block * packed Panel product kernel - * + * * Mixing type logic: C += A * B * | A | B | comments * |real |cplx | no vectorization yet, would require to pack A with duplication @@ -156,10 +156,7 @@ struct ei_gebp_kernel typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { - Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable /*ei_packet_traits<LhsScalar>::Vectorizable - && ei_packet_traits<RhsScalar>::Vectorizable - && (ei_is_same_type<LhsScalar,RhsScalar>::ret - || (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex))*/, + Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable, LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 @@ -173,7 +170,7 @@ struct ei_gebp_kernel typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) { if(strideA==-1) strideA = depth; @@ -447,6 +444,7 @@ EIGEN_ASM_COMMENT("myend"); } ResPacket R0, R1, R2, R3, R4, R5, R6, R7; + ResPacket alphav = ei_pset1<ResPacket>(alpha); R0 = ei_ploadu<ResPacket>(r0); R1 = ei_ploadu<ResPacket>(r1); @@ -457,14 +455,14 @@ EIGEN_ASM_COMMENT("myend"); if(nr==4) R6 = ei_ploadu<ResPacket>(r2 + ResPacketSize); if(nr==4) R7 = ei_ploadu<ResPacket>(r3 + ResPacketSize); - C0 = ei_padd(R0, C0); - C1 = ei_padd(R1, C1); - if(nr==4) C2 = ei_padd(R2, C2); - if(nr==4) C3 = ei_padd(R3, C3); - C4 = ei_padd(R4, C4); - C5 = ei_padd(R5, C5); - if(nr==4) C6 = ei_padd(R6, C6); - if(nr==4) C7 = ei_padd(R7, C7); + C0 = ei_pmadd(C0, alphav, R0); + C1 = ei_pmadd(C1, alphav, R1); + if(nr==4) C2 = ei_pmadd(C2, alphav, R2); + if(nr==4) C3 = ei_pmadd(C3, alphav, R3); + C4 = ei_pmadd(C4, alphav, R4); + C5 = ei_pmadd(C5, alphav, R5); + if(nr==4) C6 = ei_pmadd(C6, alphav, R6); + if(nr==4) C7 = ei_pmadd(C7, alphav, R7); ei_pstoreu(r0, C0); ei_pstoreu(r1, C1); @@ -483,10 +481,10 @@ EIGEN_ASM_COMMENT("myend"); // gets res block as register ResPacket C0, C1, C2, C3; - C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu<ResPacket>(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu<ResPacket>(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu<ResPacket>(&res[(j2+3)*resStride + i]); + C0 = ei_pset1<ResPacket>(ResScalar(0)); + C1 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C2 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C3 = ei_pset1<ResPacket>(ResScalar(0)); // performs "inner" product const RhsScalar* blB = unpackedB; @@ -573,24 +571,18 @@ EIGEN_ASM_COMMENT("myend"); if(nr==2) { LhsPacket A0; - RhsPacket B0; - #ifndef EIGEN_HAS_FUSE_CJMADD - RhsPacket T0; - #endif + RhsPacket B0, B1; A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); - MADD(pcj,A0,B0,C0,T0); - B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); - MADD(pcj,A0,B0,C1,T0); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); + MADD(pcj,A0,B0,C0,B0); + MADD(pcj,A0,B1,C1,B1); } else { LhsPacket A0; RhsPacket B0, B1, B2, B3; - #ifndef EIGEN_HAS_FUSE_CJMADD - RhsPacket T0, T1; - #endif A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); @@ -598,20 +590,38 @@ EIGEN_ASM_COMMENT("myend"); B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A0,B1,C1,T1); - MADD(pcj,A0,B2,C2,T0); - MADD(pcj,A0,B3,C3,T1); + MADD(pcj,A0,B0,C0,B0); + MADD(pcj,A0,B1,C1,B1); + MADD(pcj,A0,B2,C2,B2); + MADD(pcj,A0,B3,C3,B3); } blB += nr*RhsPacketSize; blA += LhsPacketSize; } - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+1)*resStride + i], C1); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); + ResPacket R0, R1, R2, R3; + ResPacket alphav = ei_pset1<ResPacket>(alpha); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = r0 + resStride; + ResScalar* r2 = r1 + resStride; + ResScalar* r3 = r2 + resStride; + + R0 = ei_ploadu<ResPacket>(r0); + R1 = ei_ploadu<ResPacket>(r1); + if(nr==4) R2 = ei_ploadu<ResPacket>(r2); + if(nr==4) R3 = ei_ploadu<ResPacket>(r3); + + C0 = ei_pmadd(C0, alphav, R0); + C1 = ei_pmadd(C1, alphav, R1); + if(nr==4) C2 = ei_pmadd(C2, alphav, R2); + if(nr==4) C3 = ei_pmadd(C3, alphav, R3); + + ei_pstoreu(r0, C0); + ei_pstoreu(r1, C1); + if(nr==4) ei_pstoreu(r2, C2); + if(nr==4) ei_pstoreu(r3, C3); } for(Index i=peeled_mc2; i<rows; i++) { @@ -627,24 +637,18 @@ EIGEN_ASM_COMMENT("myend"); if(nr==2) { LhsScalar A0; - RhsScalar B0; - #ifndef EIGEN_HAS_FUSE_CJMADD - ResScalar T0; - #endif + RhsScalar B0, B1; A0 = blA[k]; B0 = blB[0*RhsPacketSize]; - MADD(cj,A0,B0,C0,T0); - B0 = blB[1*RhsPacketSize]; - MADD(cj,A0,B0,C1,T0); + B1 = blB[1*RhsPacketSize]; + MADD(cj,A0,B0,C0,B0); + MADD(cj,A0,B1,C1,B1); } else { LhsScalar A0; RhsScalar B0, B1, B2, B3; - #ifndef EIGEN_HAS_FUSE_CJMADD - ResScalar T0, T1; - #endif A0 = blA[k]; B0 = blB[0*RhsPacketSize]; @@ -652,18 +656,18 @@ EIGEN_ASM_COMMENT("myend"); B2 = blB[2*RhsPacketSize]; B3 = blB[3*RhsPacketSize]; - MADD(cj,A0,B0,C0,T0); - MADD(cj,A0,B1,C1,T1); - MADD(cj,A0,B2,C2,T0); - MADD(cj,A0,B3,C3,T1); + MADD(cj,A0,B0,C0,B0); + MADD(cj,A0,B1,C1,B1); + MADD(cj,A0,B2,C2,B2); + MADD(cj,A0,B3,C3,B3); } blB += nr*RhsPacketSize; } - res[(j2+0)*resStride + i] += C0; - res[(j2+1)*resStride + i] += C1; - if(nr==4) res[(j2+2)*resStride + i] += C2; - if(nr==4) res[(j2+3)*resStride + i] += C3; + res[(j2+0)*resStride + i] += alpha*C0; + res[(j2+1)*resStride + i] += alpha*C1; + if(nr==4) res[(j2+2)*resStride + i] += alpha*C2; + if(nr==4) res[(j2+3)*resStride + i] += alpha*C3; } } @@ -687,8 +691,9 @@ EIGEN_ASM_COMMENT("myend"); // get res block as registers ResPacket C0, C4; - C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); - C4 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i + ResPacketSize]); + C0 = ei_pset1<ResPacket>(ResScalar(0)); + C4 = ei_pset1<ResPacket>(ResScalar(0)); + const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) @@ -696,21 +701,31 @@ EIGEN_ASM_COMMENT("myend"); LhsPacket A0, A1; RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - RhsPacket T0, T1; + RhsPacket T0; #endif A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,T1); + MADD(pcj,A1,B0,C4,B0); blB += RhsPacketSize; blA += mr; } + ResPacket R0, R4; + ResPacket alphav = ei_pset1<ResPacket>(alpha); - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+0)*resStride + i + ResPacketSize], C4); + ResScalar* r0 = &res[(j2+0)*resStride + i]; + + R0 = ei_ploadu<ResPacket>(r0); + R4 = ei_ploadu<ResPacket>(r0+ResPacketSize); + + C0 = ei_pmadd(C0, alphav, R0); + C4 = ei_pmadd(C4, alphav, R4); + + ei_pstoreu(r0, C0); + ei_pstoreu(r0+ResPacketSize, C4); } if(rows-peeled_mc>=LhsPacketSize) { @@ -718,20 +733,21 @@ EIGEN_ASM_COMMENT("myend"); const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsPacketSize]; ei_prefetch(&blA[0]); - ResPacket C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); + ResPacket C0 = ei_pset1<ResPacket>(ResScalar(0)); const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - #ifndef EIGEN_HAS_FUSE_CJMADD - RhsPacket T0; - #endif - MADD(pcj,ei_pload<LhsPacket>(blA), ei_pload<RhsPacket>(blB), C0, T0); + LhsPacket A0 = ei_pload<LhsPacket>(blA); + RhsPacket B0 = ei_pload<RhsPacket>(blB); + MADD(pcj, A0, B0, C0, B0); blB += RhsPacketSize; blA += LhsPacketSize; } - ei_pstoreu(&res[(j2+0)*resStride + i], C0); + ResPacket alphav = ei_pset1<ResPacket>(alpha); + ResPacket R0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); + ei_pstoreu(&res[(j2+0)*resStride + i], ei_pmadd(C0, alphav, R0)); } for(Index i=peeled_mc2; i<rows; i++) { @@ -744,12 +760,11 @@ EIGEN_ASM_COMMENT("myend"); const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - #ifndef EIGEN_HAS_FUSE_CJMADD - ResScalar T0; - #endif - MADD(cj,blA[k], blB[k*RhsPacketSize], C0, T0); + LhsScalar A0 = blA[k]; + RhsScalar B0 = blB[k*RhsPacketSize]; + MADD(cj, A0, B0, C0, B0); } - res[(j2+0)*resStride + i] += C0; + res[(j2+0)*resStride + i] += alpha*C0; } } } @@ -775,51 +790,36 @@ template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjuga struct ei_gemm_pack_lhs { void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, - Scalar alpha = Scalar(1), Index stride=0, Index offset=0) + Index stride=0, Index offset=0) { enum { PacketSize = ei_packet_traits<Scalar>::size }; ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; ei_const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); - bool hasAlpha = alpha != Scalar(1); Index count = 0; Index peeled_mc = (rows/mr)*mr; for(Index i=0; i<peeled_mc; i+=mr) { if(PanelMode) count += mr * offset; - if(hasAlpha) - for(Index k=0; k<depth; k++) - for(Index w=0; w<mr; w++) - blockA[count++] = alpha * cj(lhs(i+w, k)); - else - for(Index k=0; k<depth; k++) - for(Index w=0; w<mr; w++) - blockA[count++] = cj(lhs(i+w, k)); + for(Index k=0; k<depth; k++) + for(Index w=0; w<mr; w++) + blockA[count++] = cj(lhs(i+w, k)); if(PanelMode) count += mr * (stride-offset-depth); } if(rows-peeled_mc>=PacketSize) { if(PanelMode) count += PacketSize*offset; - if(hasAlpha) - for(Index k=0; k<depth; k++) - for(Index w=0; w<PacketSize; w++) - blockA[count++] = alpha * cj(lhs(peeled_mc+w, k)); - else - for(Index k=0; k<depth; k++) - for(Index w=0; w<PacketSize; w++) - blockA[count++] = cj(lhs(peeled_mc+w, k)); + for(Index k=0; k<depth; k++) + for(Index w=0; w<PacketSize; w++) + blockA[count++] = cj(lhs(peeled_mc+w, k)); if(PanelMode) count += PacketSize * (stride-offset-depth); peeled_mc += PacketSize; } for(Index i=peeled_mc; i<rows; i++) { if(PanelMode) count += offset; - if(hasAlpha) - for(Index k=0; k<depth; k++) - blockA[count++] = alpha * cj(lhs(i, k)); - else - for(Index k=0; k<depth; k++) - blockA[count++] = cj(lhs(i, k)); + for(Index k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); if(PanelMode) count += (stride-offset-depth); } } @@ -837,12 +837,11 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> { typedef typename ei_packet_traits<Scalar>::type Packet; enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols, + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0) { ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - bool hasAlpha = alpha != Scalar(1); Index packet_cols = (cols/nr) * nr; Index count = 0; for(Index j2=0; j2<packet_cols; j2+=nr) @@ -853,24 +852,14 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> const Scalar* b1 = &rhs[(j2+1)*rhsStride]; const Scalar* b2 = &rhs[(j2+2)*rhsStride]; const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - if (hasAlpha) - for(Index k=0; k<depth; k++) - { - blockB[count+0] = alpha*cj(b0[k]); - blockB[count+1] = alpha*cj(b1[k]); - if(nr==4) blockB[count+2] = alpha*cj(b2[k]); - if(nr==4) blockB[count+3] = alpha*cj(b3[k]); - count += nr; - } - else - for(Index k=0; k<depth; k++) - { - blockB[count+0] = cj(b0[k]); - blockB[count+1] = cj(b1[k]); - if(nr==4) blockB[count+2] = cj(b2[k]); - if(nr==4) blockB[count+3] = cj(b3[k]); - count += nr; - } + for(Index k=0; k<depth; k++) + { + blockB[count+0] = cj(b0[k]); + blockB[count+1] = cj(b1[k]); + if(nr==4) blockB[count+2] = cj(b2[k]); + if(nr==4) blockB[count+3] = cj(b3[k]); + count += nr; + } // skip what we have after if(PanelMode) count += nr * (stride-offset-depth); } @@ -880,18 +869,11 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> { if(PanelMode) count += offset; const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - if (hasAlpha) - for(Index k=0; k<depth; k++) - { - blockB[count] = alpha*cj(b0[k]); - count += 1; - } - else - for(Index k=0; k<depth; k++) - { - blockB[count] = cj(b0[k]); - count += 1; - } + for(Index k=0; k<depth; k++) + { + blockB[count] = cj(b0[k]); + count += 1; + } if(PanelMode) count += (stride-offset-depth); } } @@ -902,41 +884,25 @@ template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols, + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0) { ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - bool hasAlpha = alpha != Scalar(1); Index packet_cols = (cols/nr) * nr; Index count = 0; for(Index j2=0; j2<packet_cols; j2+=nr) { // skip what we have before if(PanelMode) count += nr * offset; - if (hasAlpha) - { - for(Index k=0; k<depth; k++) - { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = alpha*cj(b0[0]); - blockB[count+1] = alpha*cj(b0[1]); - if(nr==4) blockB[count+2] = alpha*cj(b0[2]); - if(nr==4) blockB[count+3] = alpha*cj(b0[3]); - count += nr; - } - } - else + for(Index k=0; k<depth; k++) { - for(Index k=0; k<depth; k++) - { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr==4) blockB[count+2] = cj(b0[2]); - if(nr==4) blockB[count+3] = cj(b0[3]); - count += nr; - } + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + if(nr==4) blockB[count+2] = cj(b0[2]); + if(nr==4) blockB[count+3] = cj(b0[3]); + count += nr; } // skip what we have after if(PanelMode) count += nr * (stride-offset-depth); @@ -948,7 +914,7 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> const Scalar* b0 = &rhs[j2]; for(Index k=0; k<depth; k++) { - blockB[count] = alpha*cj(b0[k*rhsStride]); + blockB[count] = cj(b0[k*rhsStride]); count += 1; } if(PanelMode) count += stride-offset-depth; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index b89cfa8a7..d92ff7b4f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -74,7 +74,6 @@ static void run(Index rows, Index cols, Index depth, ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking; - bool alphaOnLhs = NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex; Index kc = blocking.kc(); // cache block size along the K direction Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction @@ -87,8 +86,6 @@ static void run(Index rows, Index cols, Index depth, ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs; ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr> gebp; -// if ((ConjugateRhs && !alphaOnLhs) || (ConjugateLhs && alphaOnLhs)) -// alpha = ei_conj(alpha); // ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs; // ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs; // ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp; @@ -178,9 +175,6 @@ static void run(Index rows, Index cols, Index depth, RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB(); RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW(); - LhsScalar lhsAlpha = alphaOnLhs ? ei_get_factor<ResScalar,LhsScalar>::run(alpha) : LhsScalar(1); - RhsScalar rhsAlpha = alphaOnLhs ? RhsScalar(1) : ei_get_factor<ResScalar,RhsScalar>::run(alpha); - // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) for(Index k2=0; k2<depth; k2+=kc) @@ -191,7 +185,7 @@ static void run(Index rows, Index cols, Index depth, // => Pack rhs's panel into a sequential chunk of memory (L2 caching) // Note that this panel will be read as many times as the number of blocks in the lhs's // vertical panel which is, in practice, a very low number. - pack_rhs(blockB, &rhs(k2,0), rhsStride, rhsAlpha, actual_kc, cols); + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); // For each mc x kc block of the lhs's vertical panel... @@ -203,10 +197,10 @@ static void run(Index rows, Index cols, Index depth, // We pack the lhs's block into a sequential chunk of memory (L1 caching) // Note that this block will be read a very high number of times, which is equal to the number of // micro vertical panel of the large rhs's panel (e.g., cols/4 times). - pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc, lhsAlpha); + pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); // Everything is packed, we can now call the block * panel kernel: - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1, -1, 0, 0, blockW); + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); } } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 27eb4b2f8..0488114b9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -89,7 +89,7 @@ template<typename Scalar, typename Index, int nr, int StorageOrder> struct ei_symm_pack_rhs { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Scalar alpha, Index rows, Index cols, Index k2) + void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; @@ -101,12 +101,12 @@ struct ei_symm_pack_rhs { for(Index k=k2; k<end_k; k++) { - blockB[count+0] = alpha*rhs(k,j2+0); - blockB[count+1] = alpha*rhs(k,j2+1); + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); if (nr==4) { - blockB[count+2] = alpha*rhs(k,j2+2); - blockB[count+3] = alpha*rhs(k,j2+3); + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); } count += nr; } @@ -119,12 +119,12 @@ struct ei_symm_pack_rhs // transpose for(Index k=k2; k<j2; k++) { - blockB[count+0] = alpha*ei_conj(rhs(j2+0,k)); - blockB[count+1] = alpha*ei_conj(rhs(j2+1,k)); + blockB[count+0] = ei_conj(rhs(j2+0,k)); + blockB[count+1] = ei_conj(rhs(j2+1,k)); if (nr==4) { - blockB[count+2] = alpha*ei_conj(rhs(j2+2,k)); - blockB[count+3] = alpha*ei_conj(rhs(j2+3,k)); + blockB[count+2] = ei_conj(rhs(j2+2,k)); + blockB[count+3] = ei_conj(rhs(j2+3,k)); } count += nr; } @@ -134,25 +134,25 @@ struct ei_symm_pack_rhs { // normal for (Index w=0 ; w<h; ++w) - blockB[count+w] = alpha*rhs(k,j2+w); + blockB[count+w] = rhs(k,j2+w); - blockB[count+h] = alpha*rhs(k,k); + blockB[count+h] = rhs(k,k); // transpose for (Index w=h+1 ; w<nr; ++w) - blockB[count+w] = alpha*ei_conj(rhs(j2+w,k)); + blockB[count+w] = ei_conj(rhs(j2+w,k)); count += nr; ++h; } // normal for(Index k=j2+nr; k<end_k; k++) { - blockB[count+0] = alpha*rhs(k,j2+0); - blockB[count+1] = alpha*rhs(k,j2+1); + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); if (nr==4) { - blockB[count+2] = alpha*rhs(k,j2+2); - blockB[count+3] = alpha*rhs(k,j2+3); + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); } count += nr; } @@ -163,12 +163,12 @@ struct ei_symm_pack_rhs { for(Index k=k2; k<end_k; k++) { - blockB[count+0] = alpha*ei_conj(rhs(j2+0,k)); - blockB[count+1] = alpha*ei_conj(rhs(j2+1,k)); + blockB[count+0] = ei_conj(rhs(j2+0,k)); + blockB[count+1] = ei_conj(rhs(j2+1,k)); if (nr==4) { - blockB[count+2] = alpha*ei_conj(rhs(j2+2,k)); - blockB[count+3] = alpha*ei_conj(rhs(j2+3,k)); + blockB[count+2] = ei_conj(rhs(j2+2,k)); + blockB[count+3] = ei_conj(rhs(j2+3,k)); } count += nr; } @@ -181,13 +181,13 @@ struct ei_symm_pack_rhs Index half = std::min(end_k,j2); for(Index k=k2; k<half; k++) { - blockB[count] = alpha*ei_conj(rhs(j2,k)); + blockB[count] = ei_conj(rhs(j2,k)); count += 1; } if(half==j2 && half<k2+rows) { - blockB[count] = alpha*ei_real(rhs(j2,j2)); + blockB[count] = ei_real(rhs(j2,j2)); count += 1; } else @@ -196,7 +196,7 @@ struct ei_symm_pack_rhs // normal for(Index k=half+1; k<k2+rows; k++) { - blockB[count] = alpha*rhs(k,j2); + blockB[count] = rhs(k,j2); count += 1; } } @@ -253,9 +253,6 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; Index kc = size; // cache block size along the K direction @@ -282,7 +279,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -294,7 +291,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate // transposed packed copy pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } // the block diagonal { @@ -302,7 +299,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate // symmetric packed copy pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } for(Index i2=k2+kc; i2<size; i2+=mc) @@ -311,7 +308,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>() (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } } @@ -338,9 +335,6 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; Index kc = size; // cache block size along the K direction @@ -361,7 +355,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat { const Index actual_kc = std::min(k2+kc,size)-k2; - pack_rhs(blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2); + pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2); // => GEPP for(Index i2=0; i2<rows; i2+=mc) @@ -369,7 +363,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat const Index actual_mc = std::min(i2+mc,rows)-i2; pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } } diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index b1314d5dc..0641063fd 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -65,8 +65,8 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL { ei_const_blas_data_mapper<Scalar, Index, MatStorageOrder> mat(_mat,matStride); - if(AAT) - alpha = ei_conj(alpha); +// if(AAT) +// alpha = ei_conj(alpha); typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; @@ -99,7 +99,7 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL const Index actual_kc = std::min(k2+kc,depth)-k2; // note that the actual rhs is the transpose/adjoint of mat - pack_rhs(blockB, &mat(0,k2), matStride, alpha, actual_kc, size); + pack_rhs(blockB, &mat(0,k2), matStride, actual_kc, size); for(Index i2=0; i2<size; i2+=mc) { @@ -112,15 +112,15 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), alpha, -1, -1, 0, 0, allocatedBlockB); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, allocatedBlockB); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0),size-j2), + gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0), size-j2), alpha, -1, -1, 0, 0, allocatedBlockB); } } @@ -173,7 +173,7 @@ struct ei_sybb_kernel PacketSize = ei_packet_traits<Scalar>::size, BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar* workspace) + void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar alpha, Scalar* workspace) { ei_gebp_kernel<Scalar, Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer; @@ -186,14 +186,15 @@ struct ei_sybb_kernel const Scalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, -1, -1, 0, 0, workspace); + gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0, workspace); // selfadjoint micro block { Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0, workspace); // 2 - triangular accumulation for(Index j1=0; j1<actualBlockSize; ++j1) @@ -208,7 +209,7 @@ struct ei_sybb_kernel if(UpLo==Lower) { Index i = j+actualBlockSize; - gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, + gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, -1, -1, 0, 0, workspace); } } diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 90b4c9eca..556a18e10 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -105,9 +105,6 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), @@ -147,7 +144,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, k2 = k2+actual_kc-kc; } - pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, alpha, actual_kc, cols); + pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the part which is above the diagonal block => skip it @@ -176,7 +173,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, } pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); - gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, + gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); // GEBP with remaining micro panel @@ -186,7 +183,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); - gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, + gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -201,7 +198,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -231,9 +228,6 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef ei_product_blocking_traits<Scalar,Scalar> Blocking; enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), @@ -280,7 +274,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, Scalar* geb = blockB+ts*ts; - pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, alpha, actual_kc, rs); + pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); // pack the triangular part of the rhs padding the unrolled blocks with zeros if(ts>0) @@ -293,7 +287,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; // general part pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), rhsStride, alpha, + &rhs(actual_k2+panelOffset, actual_j2), rhsStride, panelLength, actualPanelWidth, actual_kc, panelOffset); @@ -307,7 +301,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, } pack_rhs_panel(blockB+j2*actual_kc, - triangularBuffer.data(), triangularBuffer.outerStride(), alpha, + triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth, actual_kc, j2); } @@ -330,6 +324,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, + alpha, actual_kc, actual_kc, // strides blockOffset, blockOffset,// offsets allocatedBlockB); // workspace @@ -337,6 +332,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, } gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, blockA, geb, actual_mc, actual_kc, rs, + alpha, -1, -1, 0, 0, allocatedBlockB); } } diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 1edf0a44d..08ad12938 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -140,7 +140,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora Index blockBOffset = IsLower ? k1 : lengthTarget; // update the respective rows of B from other - pack_rhs(blockB, _other+startBlock, otherStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset); + pack_rhs(blockB, _other+startBlock, otherStride, actualPanelWidth, cols, actual_kc, blockBOffset); // GEBP if (lengthTarget>0) @@ -149,7 +149,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); - gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, + gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -166,7 +166,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora { pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1)); } } } @@ -242,7 +242,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor if (panelLength>0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, -1, + &rhs(actual_k2+panelOffset, actual_j2), triStride, panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -273,6 +273,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor gebp_kernel(&lhs(i2,absolute_j2), otherStride, blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, + Scalar(-1), actual_kc, actual_kc, // strides panelOffset, panelOffset, // offsets allocatedBlockB); // workspace @@ -305,7 +306,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor if (rs>0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, - actual_mc, actual_kc, rs, + actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0, allocatedBlockB); } } |