author     Gael Guennebaud <g.gael@free.fr>    2010-02-23 13:06:49 +0100
committer  Gael Guennebaud <g.gael@free.fr>    2010-02-23 13:06:49 +0100
commit     eb905500b6c654860aa9f9d9c77c7c2614e0ad10 (patch)
tree       73d13d1389ffb7594777e26a52823f6c45a48eec
parent     d579d4cc37693823d03fbfedd2e48c40dcaf8938 (diff)
significant speedup in the matrix-matrix products
-rw-r--r--  Eigen/src/Core/arch/SSE/PacketMath.h              |  39
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 555
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h     |   8
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix.h |  66
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix.h  |  30
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix.h  |  22
-rw-r--r--  Eigen/src/Core/util/Macros.h                      |   2
-rw-r--r--  bench/bench_gemm.cpp                              |   4
-rw-r--r--  bench/bench_gemm_blas.cpp                         |  25

9 files changed, 433 insertions(+), 318 deletions(-)
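Editor's note: the core of this patch is a change of representation for the packed rhs. Until now, ei_gemm_pack_rhs expanded every rhs coefficient into a full packet (PacketSize copies) inside blockB, so the packed rhs occupied kc*cols*PacketSize scalars. After the patch, blockB stores plain scalars and the gebp kernel broadcasts one depth x nr panel at a time into a small scratch buffer (unpackedB) carved out in front of blockB. A back-of-the-envelope comparison of the two footprints (illustrative numbers only; these are not Eigen's actual Blocking constants):

    #include <cstdio>

    int main()
    {
      // assumed values: kc = depth block size, cols = rhs columns, SSE float packets
      const long kc = 256, cols = 2048, PacketSize = 4, nr = 4;
      long before = kc * cols * PacketSize;            // packet-expanded rhs
      long after  = kc * PacketSize * nr + kc * cols;  // scratch panel + scalar rhs
      std::printf("before: %ld scalars, after: %ld (%.1fx smaller)\n",
                  before, after, double(before) / double(after));
      return 0;
    }

The smaller packed rhs stays cache-resident, which is where the speedup of the commit message comes from.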
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index a5a56f759..de96aaffa 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -184,11 +184,12 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) {
 template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
 
-#if (!defined __GNUC__) && (!defined __ICC)
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
-#else
+// #if (!defined __GNUC__) && (!defined __ICC)
+// template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
+// template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
+// template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+// #else
+
 // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
 // require pointer casting to incompatible pointer types and leads to invalid code
 // because of the strict aliasing rule. The "dummy" stuff are required to enforce
@@ -197,28 +198,27 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_
 template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
-  __m128 res;
-  asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) );
-  asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
-  return res;
+  __m128d res;
+  res = _mm_load_sd((const double*)(from)) ;
+  res = _mm_loadh_pd(res, (const double*)(from+2)) ;
+  return _mm_castpd_ps(res);
 }
 template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
   __m128d res;
-  asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from) );
-  asm volatile ("movhpd %[from1], %[r]" : [r] "+x" (res) : [from1] "m" (*(from+1)) );
+  res = _mm_load_sd(from) ;
+  res = _mm_loadh_pd(res,from+1);
   return res;
 }
 template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
-  __m128i res;
-  asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) );
-  asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
-  return res;
+  __m128d res;
+  res = _mm_load_sd((const double*)(from)) ;
+  res = _mm_loadh_pd(res, (const double*)(from+2)) ;
+  return _mm_castpd_si128(res);
 }
-#endif
 
 template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
@@ -277,6 +277,13 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a)
   #endif
 }
 
+EIGEN_STRONG_INLINE void ei_punpackp(Packet4f* vecs)
+{
+  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
+  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
+  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
+  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
+}
 
 #ifdef __SSE3__
 // TODO implement SSE2 versions as well as integer versions
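The two PacketMath.h changes above are worth isolating: ei_ploadu now builds an unaligned packet from a movsd/movhps pair written with intrinsics instead of inline asm (the casts to const double* mirror the patch), and the new ei_punpackp broadcasts each lane of a packet into a packet of its own. A standalone sketch of both, assuming SSE2 (editorial illustration, not part of the patch; compile with e.g. g++ -msse2):

    #include <emmintrin.h>
    #include <cstdio>

    static inline __m128 ploadu_f(const float* from)
    {
      // same movsd + movhps sequence as the patch, without casting
      // float* to __m128*, which would break strict aliasing
      __m128d lo = _mm_load_sd(reinterpret_cast<const double*>(from));
      __m128d hl = _mm_loadh_pd(lo, reinterpret_cast<const double*>(from + 2));
      return _mm_castpd_ps(hl);
    }

    static inline void punpackp(__m128* v)
    {
      // duplicate each lane of v[0] into a full packet, as ei_punpackp does
      v[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v[0]), 0x55));
      v[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v[0]), 0xAA));
      v[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v[0]), 0xFF));
      v[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v[0]), 0x00));
    }

    int main()
    {
      float data[5] = { 0.f, 1.f, 2.f, 3.f, 4.f };
      __m128 v[4];
      v[0] = ploadu_f(data + 1);   // unaligned load of 1,2,3,4
      punpackp(v);                 // v[i] == { data[1+i] } repeated 4 times
      float out[4];
      for (int i = 0; i < 4; ++i)
      {
        _mm_storeu_ps(out, v[i]);
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
      }
      return 0;
    }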
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 18e913b0e..dfc92c346 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -30,7 +30,7 @@
 #ifdef EIGEN_HAS_FUSE_CJMADD
 #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
 #else
-#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
+#define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
 #endif
 
 // optimized GEneral packed Block * packed Panel product kernel
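Note the new temporary order in CJMADD: the scratch T now receives B rather than A, which lets the kernel below pass a B register on its last use as the scratch, e.g. CJMADD(A1,B0,C4,B0). A sketch of why that matters for register pressure (editorial; plain SSE stand-ins for cj.pmul/ei_padd):

    #include <xmmintrin.h>

    #define CJMADD(A,B,C,T)  T = B; T = _mm_mul_ps(A,T); C = _mm_add_ps(C,T);

    // One unrolled step of the nr==2 loop: the first call still needs a spare
    // temporary because B0 is read again, but the second call clobbers B0 on
    // its last use, so no second temporary has to stay live. With C0..C7,
    // A0, A1 and B0..B3 all in flight, the saved registers are what keep the
    // nr==4 kernel inside the 16 xmm registers of x86-64 without spilling.
    static inline void step(__m128 A0, __m128 A1, __m128 B0,
                            __m128& C0, __m128& C4, __m128& T0)
    {
      CJMADD(A0,B0,C0,T0);   // B0 still live: use the real temporary
      CJMADD(A1,B0,C4,B0);   // last use of B0: let it be overwritten
    }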
@@ -48,9 +48,66 @@ struct ei_gebp_kernel
     const int peeled_mc = (rows/mr)*mr;
     const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
     const int peeled_kc = (depth/4)*4;
+
+    Scalar* unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
+
     // loops on each micro vertical panel of rhs (depth x nr)
     for(int j2=0; j2<packet_cols; j2+=nr)
     {
+      // unpack B
+      {
+        const Scalar* blB = &blockB[j2*strideB+offsetB*nr];
+        int n = depth*nr;
+        for(int k=0; k<n; k++)
+          ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
+        /*Scalar* dest = unpackedB;
+        for(int k=0; k<n; k+=4*PacketSize)
+        {
+          #ifdef EIGEN_VECTORIZE_SSE
+          const int S = 128;
+          const int G = 16;
+          _mm_prefetch((const char*)(&blB[S/2+0]), _MM_HINT_T0);
+          _mm_prefetch((const char*)(&dest[S+0*G]), _MM_HINT_T0);
+          _mm_prefetch((const char*)(&dest[S+1*G]), _MM_HINT_T0);
+          _mm_prefetch((const char*)(&dest[S+2*G]), _MM_HINT_T0);
+          _mm_prefetch((const char*)(&dest[S+3*G]), _MM_HINT_T0);
+          #endif
+
+          PacketType C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize];
+          C0[0] = ei_pload(blB+0*PacketSize);
+          C1[0] = ei_pload(blB+1*PacketSize);
+          C2[0] = ei_pload(blB+2*PacketSize);
+          C3[0] = ei_pload(blB+3*PacketSize);
+
+          ei_punpackp(C0);
+          ei_punpackp(C1);
+          ei_punpackp(C2);
+          ei_punpackp(C3);
+
+          ei_pstore(dest+ 0*PacketSize, C0[0]);
+          ei_pstore(dest+ 1*PacketSize, C0[1]);
+          ei_pstore(dest+ 2*PacketSize, C0[2]);
+          ei_pstore(dest+ 3*PacketSize, C0[3]);
+
+          ei_pstore(dest+ 4*PacketSize, C1[0]);
+          ei_pstore(dest+ 5*PacketSize, C1[1]);
+          ei_pstore(dest+ 6*PacketSize, C1[2]);
+          ei_pstore(dest+ 7*PacketSize, C1[3]);
+
+          ei_pstore(dest+ 8*PacketSize, C2[0]);
+          ei_pstore(dest+ 9*PacketSize, C2[1]);
+          ei_pstore(dest+10*PacketSize, C2[2]);
+          ei_pstore(dest+11*PacketSize, C2[3]);
+
+          ei_pstore(dest+12*PacketSize, C3[0]);
+          ei_pstore(dest+13*PacketSize, C3[1]);
+          ei_pstore(dest+14*PacketSize, C3[2]);
+          ei_pstore(dest+15*PacketSize, C3[3]);
+
+          blB += 4*PacketSize;
+          dest += 16*PacketSize;
+        }*/
+      }
+
       // loops on each micro horizontal panel of lhs (mr x depth)
       // => we select a mr x nr micro block of res which is entirely
       //    stored into mr/packet_size x nr registers.
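The unpack loop above is the new per-panel expansion: blockB now holds one scalar per coefficient, and each is broadcast into a full packet at unpackedB[k*PacketSize] just before the panel is consumed (the commented-out variant experimented with doing the same via ei_punpackp plus prefetching). In plain intrinsics the active loop amounts to (editorial sketch; unpackedB must be 16-byte aligned):

    #include <xmmintrin.h>

    enum { PacketSize = 4 };

    void unpack_panel(float* unpackedB, const float* blB, int depth, int nr)
    {
      // blB: depth*nr scalars of one packed rhs panel;
      // unpackedB: the same coefficients, each broadcast to a whole packet
      for (int k = 0; k < depth*nr; ++k)
        _mm_store_ps(&unpackedB[k*PacketSize], _mm_set1_ps(blB[k]));
    }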
@@ -65,19 +122,31 @@ struct ei_gebp_kernel
         // gets res block as register
         PacketType C0, C1, C2, C3, C4, C5, C6, C7;
-        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
-        C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
-        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
-        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
-        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
-        C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
-        if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
-        if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);
+        C0 = ei_pset1(Scalar(0));
+        C1 = ei_pset1(Scalar(0));
+        if(nr==4) C2 = ei_pset1(Scalar(0));
+        if(nr==4) C3 = ei_pset1(Scalar(0));
+        C4 = ei_pset1(Scalar(0));
+        C5 = ei_pset1(Scalar(0));
+        if(nr==4) C6 = ei_pset1(Scalar(0));
+        if(nr==4) C7 = ei_pset1(Scalar(0));
+
+        Scalar* r0 = &res[(j2+0)*resStride + i];
+        Scalar* r1 = r0 + resStride;
+        Scalar* r2 = r1 + resStride;
+        Scalar* r3 = r2 + resStride;
+
+        #ifdef EIGEN_VECTORIZE_SSE
+        _mm_prefetch((const char*)(r0+16), _MM_HINT_T0);
+        _mm_prefetch((const char*)(r1+16), _MM_HINT_T0);
+        _mm_prefetch((const char*)(r2+16), _MM_HINT_T0);
+        _mm_prefetch((const char*)(r3+16), _MM_HINT_T0);
+        #endif
 
         // performs "inner" product
         // TODO let's check wether the flowing peeled loop could not be
         //      optimized via optimal prefetching from one loop to the other
-        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
+        const Scalar* blB = unpackedB;
         for(int k=0; k<peeled_kc; k+=4)
         {
           if(nr==2)
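Two things change in the hunk above: the accumulators start from zero instead of being pre-loaded from res (the read-add-store of res is deferred to a single pass after the k loop, shown further down), and the result rows are prefetched while the multiply loop runs. Stripped of Eigen's abstractions, the new shape is roughly (editorial sketch with assumed blocking mr = PacketSize, nr = 2; blA/blB are the aligned packed buffers):

    #include <xmmintrin.h>

    void micro_kernel(float* r0, float* r1,             // two result columns
                      const float* blA, const float* blB, int depth)
    {
      __m128 C0 = _mm_setzero_ps();                     // was: loaded from r0
      __m128 C1 = _mm_setzero_ps();                     // was: loaded from r1
      _mm_prefetch(reinterpret_cast<const char*>(r0 + 16), _MM_HINT_T0);
      _mm_prefetch(reinterpret_cast<const char*>(r1 + 16), _MM_HINT_T0);
      for (int k = 0; k < depth; ++k)
      {
        __m128 A0 = _mm_load_ps(blA + 4*k);             // packed lhs, aligned
        C0 = _mm_add_ps(C0, _mm_mul_ps(A0, _mm_load_ps(blB + 8*k)));     // col 0
        C1 = _mm_add_ps(C1, _mm_mul_ps(A0, _mm_load_ps(blB + 8*k + 4))); // col 1
      }
      // res is touched exactly once, after the hot loop
      _mm_storeu_ps(r0, _mm_add_ps(_mm_loadu_ps(r0), C0));
      _mm_storeu_ps(r1, _mm_add_ps(_mm_loadu_ps(r1), C1));
    }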
@@ -88,102 +157,101 @@ struct ei_gebp_kernel
             A1 = ei_pload(&blA[1*PacketSize]);
             B0 = ei_pload(&blB[0*PacketSize]);
             CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,T0);
+            CJMADD(A1,B0,C4,B0);
             B0 = ei_pload(&blB[1*PacketSize]);
             CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,T0);
+            CJMADD(A1,B0,C5,B0);
 
             A0 = ei_pload(&blA[2*PacketSize]);
             A1 = ei_pload(&blA[3*PacketSize]);
             B0 = ei_pload(&blB[2*PacketSize]);
             CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,T0);
+            CJMADD(A1,B0,C4,B0);
             B0 = ei_pload(&blB[3*PacketSize]);
             CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,T0);
+            CJMADD(A1,B0,C5,B0);
 
             A0 = ei_pload(&blA[4*PacketSize]);
             A1 = ei_pload(&blA[5*PacketSize]);
             B0 = ei_pload(&blB[4*PacketSize]);
             CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,T0);
+            CJMADD(A1,B0,C4,B0);
             B0 = ei_pload(&blB[5*PacketSize]);
             CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,T0);
+            CJMADD(A1,B0,C5,B0);
 
             A0 = ei_pload(&blA[6*PacketSize]);
             A1 = ei_pload(&blA[7*PacketSize]);
             B0 = ei_pload(&blB[6*PacketSize]);
             CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,T0);
+            CJMADD(A1,B0,C4,B0);
             B0 = ei_pload(&blB[7*PacketSize]);
             CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,T0);
+            CJMADD(A1,B0,C5,B0);
           }
           else
           {
-            PacketType B0, B1, B2, B3, A0, A1;
-            PacketType T0, T1;
+            PacketType T0;
+
+            A0 = ei_pload(&blA[0*PacketSize]);
+            A1 = ei_pload(&blA[1*PacketSize]);
+            B0 = ei_pload(&blB[0*PacketSize]);
+            B1 = ei_pload(&blB[1*PacketSize]);
+
+            CJMADD(A0,B0,C0,T0);
+            B2 = ei_pload(&blB[2*PacketSize]);
+            CJMADD(A1,B0,C4,B0);
+            B3 = ei_pload(&blB[3*PacketSize]);
+            B0 = ei_pload(&blB[4*PacketSize]);
+            CJMADD(A0,B1,C1,T0);
+            CJMADD(A1,B1,C5,B1);
+            B1 = ei_pload(&blB[5*PacketSize]);
+            CJMADD(A0,B2,C2,T0);
+            CJMADD(A1,B2,C6,B2);
+            B2 = ei_pload(&blB[6*PacketSize]);
+            CJMADD(A0,B3,C3,T0);
+            A0 = ei_pload(&blA[2*PacketSize]);
+            CJMADD(A1,B3,C7,B3);
+            A1 = ei_pload(&blA[3*PacketSize]);
+            B3 = ei_pload(&blB[7*PacketSize]);
+            CJMADD(A0,B0,C0,T0);
+            CJMADD(A1,B0,C4,B0);
+            B0 = ei_pload(&blB[8*PacketSize]);
+            CJMADD(A0,B1,C1,T0);
+            CJMADD(A1,B1,C5,B1);
+            B1 = ei_pload(&blB[9*PacketSize]);
+            CJMADD(A0,B2,C2,T0);
+            CJMADD(A1,B2,C6,B2);
+            B2 = ei_pload(&blB[10*PacketSize]);
+            CJMADD(A0,B3,C3,T0);
+            A0 = ei_pload(&blA[4*PacketSize]);
+            CJMADD(A1,B3,C7,B3);
+            A1 = ei_pload(&blA[5*PacketSize]);
+            B3 = ei_pload(&blB[11*PacketSize]);
-            A0 = ei_pload(&blA[0*PacketSize]);
-            A1 = ei_pload(&blA[1*PacketSize]);
-            B0 = ei_pload(&blB[0*PacketSize]);
-            B1 = ei_pload(&blB[1*PacketSize]);
-
-            CJMADD(A0,B0,C0,T0);
-            if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
-            CJMADD(A1,B0,C4,T1);
-            if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
-            B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,T1);
-            B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) { CJMADD(A1,B2,C6,T1); }
-            if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
-            if(nr==4) { CJMADD(A0,B3,C3,T0); }
-            A0 = ei_pload(&blA[2*PacketSize]);
-            if(nr==4) { CJMADD(A1,B3,C7,T1); }
-            A1 = ei_pload(&blA[3*PacketSize]);
-            if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,T1);
-            B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,T1);
-            B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) { CJMADD(A1,B2,C6,T1); }
-            if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
-            if(nr==4) { CJMADD(A0,B3,C3,T0); }
-            A0 = ei_pload(&blA[4*PacketSize]);
-            if(nr==4) { CJMADD(A1,B3,C7,T1); }
-            A1 = ei_pload(&blA[5*PacketSize]);
-            if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
-
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,T1);
-            B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,T1);
-            B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) { CJMADD(A1,B2,C6,T1); }
-            if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
-            if(nr==4) { CJMADD(A0,B3,C3,T0); }
-            A0 = ei_pload(&blA[6*PacketSize]);
-            if(nr==4) { CJMADD(A1,B3,C7,T1); }
-            A1 = ei_pload(&blA[7*PacketSize]);
-            if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,T1);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,T1);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) { CJMADD(A1,B2,C6,T1); }
-            if(nr==4) { CJMADD(A0,B3,C3,T0); }
-            if(nr==4) { CJMADD(A1,B3,C7,T1); }
+            CJMADD(A0,B0,C0,T0);
+            CJMADD(A1,B0,C4,B0);
+            B0 = ei_pload(&blB[12*PacketSize]);
+            CJMADD(A0,B1,C1,T0);
+            CJMADD(A1,B1,C5,B1);
+            B1 = ei_pload(&blB[13*PacketSize]);
+            CJMADD(A0,B2,C2,T0);
+            CJMADD(A1,B2,C6,B2);
+            B2 = ei_pload(&blB[14*PacketSize]);
+            CJMADD(A0,B3,C3,T0);
+            A0 = ei_pload(&blA[6*PacketSize]);
+            CJMADD(A1,B3,C7,B3);
+            A1 = ei_pload(&blA[7*PacketSize]);
+            B3 = ei_pload(&blB[15*PacketSize]);
+            CJMADD(A0,B0,C0,T0);
+            CJMADD(A1,B0,C4,B0);
+            CJMADD(A0,B1,C1,T0);
+            CJMADD(A1,B1,C5,B1);
+            CJMADD(A0,B2,C2,T0);
+            CJMADD(A1,B2,C6,B2);
+            CJMADD(A0,B3,C3,T0);
+            CJMADD(A1,B3,C7,B3);
           }
 
           blB += 4*nr*PacketSize;
@@ -200,45 +268,64 @@ struct ei_gebp_kernel
             A1 = ei_pload(&blA[1*PacketSize]);
             B0 = ei_pload(&blB[0*PacketSize]);
             CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,T0);
+            CJMADD(A1,B0,C4,B0);
             B0 = ei_pload(&blB[1*PacketSize]);
             CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,T0);
+            CJMADD(A1,B0,C5,B0);
           }
           else
           {
-            PacketType B0, B1, B2, B3, A0, A1, T0, T1;
-
-            A0 = ei_pload(&blA[0*PacketSize]);
-            A1 = ei_pload(&blA[1*PacketSize]);
-            B0 = ei_pload(&blB[0*PacketSize]);
-            B1 = ei_pload(&blB[1*PacketSize]);
-
-            CJMADD(A0,B0,C0,T0);
-            if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
-            CJMADD(A1,B0,C4,T1);
-            if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
-            B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,T1);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) { CJMADD(A1,B2,C6,T1); }
-            if(nr==4) { CJMADD(A0,B3,C3,T0); }
-            if(nr==4) { CJMADD(A1,B3,C7,T1); }
+            PacketType B0, B1, B2, B3, A0, A1, T0;
+
+            A0 = ei_pload(&blA[0*PacketSize]);
+            A1 = ei_pload(&blA[1*PacketSize]);
+            B0 = ei_pload(&blB[0*PacketSize]);
+            B1 = ei_pload(&blB[1*PacketSize]);
+
+            CJMADD(A0,B0,C0,T0);
+            B2 = ei_pload(&blB[2*PacketSize]);
+            CJMADD(A1,B0,C4,B0);
+            B3 = ei_pload(&blB[3*PacketSize]);
+            CJMADD(A0,B1,C1,T0);
+            CJMADD(A1,B1,C5,B1);
+            CJMADD(A0,B2,C2,T0);
+            CJMADD(A1,B2,C6,B2);
+            CJMADD(A0,B3,C3,T0);
+            CJMADD(A1,B3,C7,B3);
          }
 
          blB += nr*PacketSize;
          blA += mr;
        }
 
-        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
-        ei_pstoreu(&res[(j2+1)*resStride + i], C1);
-        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
-        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
-        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
-        ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
-        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
-        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
+        PacketType R0, R1, R2, R3, R4, R5, R6, R7;
+
+        R0 = ei_ploadu(r0);
+        R1 = ei_ploadu(r1);
+        if(nr==4) R2 = ei_ploadu(r2);
+        if(nr==4) R3 = ei_ploadu(r3);
+        R4 = ei_ploadu(r0 + PacketSize);
+        R5 = ei_ploadu(r1 + PacketSize);
+        if(nr==4) R6 = ei_ploadu(r2 + PacketSize);
+        if(nr==4) R7 = ei_ploadu(r3 + PacketSize);
+
+        C0 = ei_padd(R0, C0);
+        C1 = ei_padd(R1, C1);
+        if(nr==4) C2 = ei_padd(R2, C2);
+        if(nr==4) C3 = ei_padd(R3, C3);
+        C4 = ei_padd(R4, C4);
+        C5 = ei_padd(R5, C5);
+        if(nr==4) C6 = ei_padd(R6, C6);
+        if(nr==4) C7 = ei_padd(R7, C7);
+
+        ei_pstoreu(r0, C0);
+        ei_pstoreu(r1, C1);
+        if(nr==4) ei_pstoreu(r2, C2);
+        if(nr==4) ei_pstoreu(r3, C3);
+        ei_pstoreu(r0 + PacketSize, C4);
+        ei_pstoreu(r1 + PacketSize, C5);
+        if(nr==4) ei_pstoreu(r2 + PacketSize, C6);
+        if(nr==4) ei_pstoreu(r3 + PacketSize, C7);
      }
      if(rows-peeled_mc>=PacketSize)
      {
@@ -256,81 +343,76 @@ struct ei_gebp_kernel
         if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
 
         // performs "inner" product
-        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
+        const Scalar* blB = unpackedB;
         for(int k=0; k<peeled_kc; k+=4)
         {
           if(nr==2)
           {
-            PacketType B0, T0, A0;
+            PacketType B0, B1, A0;
 
             A0 = ei_pload(&blA[0*PacketSize]);
             B0 = ei_pload(&blB[0*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            B0 = ei_pload(&blB[1*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
-
-            A0 = ei_pload(&blA[1*PacketSize]);
+            B1 = ei_pload(&blB[1*PacketSize]);
+            CJMADD(A0,B0,C0,B0);
             B0 = ei_pload(&blB[2*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            B0 = ei_pload(&blB[3*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
-
-            A0 = ei_pload(&blA[2*PacketSize]);
+            CJMADD(A0,B1,C1,B1);
+            A0 = ei_pload(&blA[1*PacketSize]);
+            B1 = ei_pload(&blB[3*PacketSize]);
+            CJMADD(A0,B0,C0,B0);
             B0 = ei_pload(&blB[4*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            B0 = ei_pload(&blB[5*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
-
-            A0 = ei_pload(&blA[3*PacketSize]);
+            CJMADD(A0,B1,C1,B1);
+            A0 = ei_pload(&blA[2*PacketSize]);
+            B1 = ei_pload(&blB[5*PacketSize]);
+            CJMADD(A0,B0,C0,B0);
             B0 = ei_pload(&blB[6*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            B0 = ei_pload(&blB[7*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
+            CJMADD(A0,B1,C1,B1);
+            A0 = ei_pload(&blA[3*PacketSize]);
+            B1 = ei_pload(&blB[7*PacketSize]);
+            CJMADD(A0,B0,C0,B0);
+            CJMADD(A0,B1,C1,B1);
           }
           else
           {
-            PacketType B0, B1, B2, B3, A0;
-            PacketType T0, T1;
-            A0 = ei_pload(&blA[0*PacketSize]);
-            B0 = ei_pload(&blB[0*PacketSize]);
-            B1 = ei_pload(&blB[1*PacketSize]);
-
-            CJMADD(A0,B0,C0,T0);
-            if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
-            if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
-            B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
-            CJMADD(A0,B1,C1,T1);
-            B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
-            if(nr==4) { CJMADD(A0,B3,C3,T1); }
-            A0 = ei_pload(&blA[1*PacketSize]);
-            if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
-            CJMADD(A0,B1,C1,T1);
-            B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
-            if(nr==4) { CJMADD(A0,B3,C3,T1); }
-            A0 = ei_pload(&blA[2*PacketSize]);
-            if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
-
-            CJMADD(A0,B0,C0,T0);
-            B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
-            CJMADD(A0,B1,C1,T1);
-            B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
-            if(nr==4) { CJMADD(A0,B3,C3,T1); }
-            A0 = ei_pload(&blA[3*PacketSize]);
-            if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A0,B1,C1,T1);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) { CJMADD(A0,B3,C3,T1); }
+            A0 = ei_pload(&blA[0*PacketSize]);
+            B0 = ei_pload(&blB[0*PacketSize]);
+            B1 = ei_pload(&blB[1*PacketSize]);
+
+            CJMADD(A0,B0,C0,B0);
+            B2 = ei_pload(&blB[2*PacketSize]);
+            B3 = ei_pload(&blB[3*PacketSize]);
+            B0 = ei_pload(&blB[4*PacketSize]);
+            CJMADD(A0,B1,C1,B1);
+            B1 = ei_pload(&blB[5*PacketSize]);
+            CJMADD(A0,B2,C2,B2);
+            B2 = ei_pload(&blB[6*PacketSize]);
+            CJMADD(A0,B3,C3,B3);
+            A0 = ei_pload(&blA[1*PacketSize]);
+            B3 = ei_pload(&blB[7*PacketSize]);
+            CJMADD(A0,B0,C0,B0);
+            B0 = ei_pload(&blB[8*PacketSize]);
+            CJMADD(A0,B1,C1,B1);
+            B1 = ei_pload(&blB[9*PacketSize]);
+            CJMADD(A0,B2,C2,B2);
+            B2 = ei_pload(&blB[10*PacketSize]);
+            CJMADD(A0,B3,C3,B3);
+            A0 = ei_pload(&blA[2*PacketSize]);
+            B3 = ei_pload(&blB[11*PacketSize]);
+
+            CJMADD(A0,B0,C0,B0);
+            B0 = ei_pload(&blB[12*PacketSize]);
+            CJMADD(A0,B1,C1,B1);
+            B1 = ei_pload(&blB[13*PacketSize]);
+            CJMADD(A0,B2,C2,B2);
+            B2 = ei_pload(&blB[14*PacketSize]);
+            CJMADD(A0,B3,C3,B3);
+            A0 = ei_pload(&blA[3*PacketSize]);
+            B3 = ei_pload(&blB[15*PacketSize]);
+            CJMADD(A0,B0,C0,B0);
+            CJMADD(A0,B1,C1,B1);
+            CJMADD(A0,B2,C2,B2);
+            CJMADD(A0,B3,C3,B3);
           }
 
           blB += 4*nr*PacketSize;
@@ -354,16 +436,16 @@ struct ei_gebp_kernel
             PacketType B0, B1, B2, B3, A0;
             PacketType T0, T1;
 
-            A0 = ei_pload(&blA[0*PacketSize]);
-            B0 = ei_pload(&blB[0*PacketSize]);
-            B1 = ei_pload(&blB[1*PacketSize]);
-            if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
-            if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+            A0 = ei_pload(&blA[0*PacketSize]);
+            B0 = ei_pload(&blB[0*PacketSize]);
+            B1 = ei_pload(&blB[1*PacketSize]);
+            B2 = ei_pload(&blB[2*PacketSize]);
+            B3 = ei_pload(&blB[3*PacketSize]);
 
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A0,B1,C1,T1);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) { CJMADD(A0,B3,C3,T1); }
+            CJMADD(A0,B0,C0,T0);
+            CJMADD(A0,B1,C1,T1);
+            CJMADD(A0,B2,C2,T0);
+            CJMADD(A0,B3,C3,T1);
          }
 
          blB += nr*PacketSize;
@@ -384,7 +466,8 @@ struct ei_gebp_kernel
        // gets a 1 x nr res block as registers
        Scalar C0(0), C1(0), C2(0), C3(0);
 
-        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr];
+        // TODO directly use blockB ???
+        const Scalar* blB = unpackedB;//&blockB[j2*strideB+offsetB*nr];
        for(int k=0; k<depth; k++)
        {
          if(nr==2)
@@ -402,16 +485,16 @@ struct ei_gebp_kernel
            Scalar B0, B1, B2, B3, A0;
            Scalar T0, T1;
 
-            A0 = blA[k];
-            B0 = blB[0*PacketSize];
-            B1 = blB[1*PacketSize];
-            if(nr==4) B2 = blB[2*PacketSize];
-            if(nr==4) B3 = blB[3*PacketSize];
+            A0 = blA[k];
+            B0 = blB[0*PacketSize];
+            B1 = blB[1*PacketSize];
+            B2 = blB[2*PacketSize];
+            B3 = blB[3*PacketSize];
 
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A0,B1,C1,T1);
-            if(nr==4) { CJMADD(A0,B2,C2,T0); }
-            if(nr==4) { CJMADD(A0,B3,C3,T1); }
+            CJMADD(A0,B0,C0,T0);
+            CJMADD(A0,B1,C1,T1);
+            CJMADD(A0,B2,C2,T0);
+            CJMADD(A0,B3,C3,T1);
          }
 
          blB += nr*PacketSize;
@@ -427,6 +510,13 @@ struct ei_gebp_kernel
    //    => do the same but with nr==1
    for(int j2=packet_cols; j2<cols; j2++)
    {
+      // unpack B
+      {
+        const Scalar* blB = &blockB[j2*strideB+offsetB];
+        for(int k=0; k<depth; k++)
+          ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
+      }
+
      for(int i=0; i<peeled_mc; i+=mr)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
@@ -436,12 +526,12 @@ struct ei_gebp_kernel
 
        // TODO move the res loads to the stores
 
-        // gets res block as register
+        // get res block as registers
        PacketType C0, C4;
        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
 
-        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
+        const Scalar* blB = unpackedB;
        for(int k=0; k<depth; k++)
        {
          PacketType B0, A0, A1, T0, T1;
@@ -469,7 +559,7 @@ struct ei_gebp_kernel
 
        PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
 
-        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
+        const Scalar* blB = unpackedB;
        for(int k=0; k<depth; k++)
        {
          C0 = cj.pmadd(ei_pload(blA), ei_pload(blB), C0);
@@ -488,7 +578,8 @@ struct ei_gebp_kernel
        // gets a 1 x 1 res block as registers
        Scalar C0(0);
 
-        const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
+        // FIXME directly use blockB ??
+        const Scalar* blB = unpackedB;
        for(int k=0; k<depth; k++)
          C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
        res[(j2+0)*resStride + i] += C0;
@@ -552,9 +643,9 @@ struct ei_gemm_pack_lhs
   }
 };
 
-// copy a complete panel of the rhs while expending each coefficient into a packet form
+// copy a complete panel of the rhs
 // this version is optimized for column major matrices
-// The traversal order is as follow (nr==4):
+// The traversal order is as follow: (nr==4):
 //  0  1  2  3   12 13 14 15   24 27
 //  4  5  6  7   16 17 18 19   25 28
 //  8  9 10 11   20 21 22 23   26 29
@@ -574,65 +665,51 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
-      if(PanelMode) count += PacketSize * nr * offset;
+      if(PanelMode) count += nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      if (hasAlpha)
-      {
        for(int k=0; k<depth; k++)
        {
-          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k]));
-          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k]));
-          if (nr==4)
-          {
-            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k]));
-            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
-          }
-          count += nr*PacketSize;
+          blockB[count+0] = alpha*b0[k];
+          blockB[count+1] = alpha*b1[k];
+          if(nr==4) blockB[count+2] = alpha*b2[k];
+          if(nr==4) blockB[count+3] = alpha*b3[k];
+          count += nr;
        }
-      }
      else
-      {
        for(int k=0; k<depth; k++)
        {
-          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k]));
-          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k]));
-          if (nr==4)
-          {
-            ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k]));
-            ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
-          }
-          count += nr*PacketSize;
+          blockB[count+0] = b0[k];
+          blockB[count+1] = b1[k];
+          if(nr==4) blockB[count+2] = b2[k];
+          if(nr==4) blockB[count+3] = b3[k];
+          count += nr;
        }
-      }
      // skip what we have after
-      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
+      if(PanelMode) count += nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
-      if(PanelMode) count += PacketSize * offset;
+      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      if (hasAlpha)
-      {
        for(int k=0; k<depth; k++)
        {
-          ei_pstore(&blockB[count], ei_pset1(alpha*b0[k]));
-          count += PacketSize;
+          blockB[count] = alpha*b0[k];
+          count += 1;
        }
-      }
      else
-      {
        for(int k=0; k<depth; k++)
        {
-          ei_pstore(&blockB[count], ei_pset1(b0[k]));
-          count += PacketSize;
+          blockB[count] = b0[k];
+          count += 1;
        }
-      }
-      if(PanelMode) count += PacketSize * (stride-offset-depth);
+      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
@@ -652,17 +729,17 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
    for(int j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
-      if(PanelMode) count += PacketSize * nr * offset;
+      if(PanelMode) count += nr * offset;
      if (hasAlpha)
      {
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
-          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0]));
-          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1]));
-          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2]));
-          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3]));
-          count += nr*PacketSize;
+          blockB[count+0] = alpha*b0[0];
+          blockB[count+1] = alpha*b0[1];
+          if(nr==4) blockB[count+2] = alpha*b0[2];
+          if(nr==4) blockB[count+3] = alpha*b0[3];
+          count += nr;
        }
      }
      else
@@ -670,27 +747,27 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
        for(int k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
-          ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0]));
-          ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1]));
-          if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2]));
-          if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3]));
-          count += nr*PacketSize;
+          blockB[count+0] = b0[0];
+          blockB[count+1] = b0[1];
+          if(nr==4) blockB[count+2] = b0[2];
+          if(nr==4) blockB[count+3] = b0[3];
+          count += nr;
        }
      }
      // skip what we have after
-      if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
+      if(PanelMode) count += nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(int j2=packet_cols; j2<cols; ++j2)
    {
-      if(PanelMode) count += PacketSize * offset;
+      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[j2];
      for(int k=0; k<depth; k++)
      {
-        ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride]));
-        count += PacketSize;
+        blockB[count] = alpha*b0[k*rhsStride];
+        count += 1;
      }
-      if(PanelMode) count += PacketSize * (stride-offset-depth);
+      if(PanelMode) count += stride-offset-depth;
    }
  }
};
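With the expansion moved into the kernel, both ei_gemm_pack_rhs specializations above collapse into plain scalar copies, and the panel-mode offsets shrink by the same PacketSize factor. Side by side, for the column-major case (editorial sketch, nr==2 for brevity; blockB needed packet alignment before, a plain array suffices now):

    #include <xmmintrin.h>

    enum { PacketSize = 4, nr = 2 };

    // before: every coefficient expanded to a whole packet at packing time,
    // so the packed rhs weighed kc*cols*PacketSize scalars
    void pack_rhs_before(float* blockB, const float* b0, const float* b1, int depth)
    {
      int count = 0;
      for (int k = 0; k < depth; ++k)
      {
        _mm_store_ps(&blockB[count + 0*PacketSize], _mm_set1_ps(b0[k]));
        _mm_store_ps(&blockB[count + 1*PacketSize], _mm_set1_ps(b1[k]));
        count += nr*PacketSize;
      }
    }

    // after: plain scalars (kc*cols of them); broadcasting is deferred
    // to the kernel, one depth x nr panel at a time
    void pack_rhs_after(float* blockB, const float* b0, const float* b1, int depth)
    {
      int count = 0;
      for (int k = 0; k < depth; ++k)
      {
        blockB[count+0] = b0[k];
        blockB[count+1] = b1[k];
        count += nr;
      }
    }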
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index c13e09eac..25c8d4c96 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -78,8 +78,10 @@ static void run(int rows, int cols, int depth,
   int kc = std::min<int>(Blocking::Max_kc,depth);  // cache block size along the K direction
   int mc = std::min<int>(Blocking::Max_mc,rows);   // cache block size along the M direction
 
-  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc*8);
+  std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+  Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+  Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
   // For each horizontal panel of the rhs, and corresponding panel of the lhs...
   // (==GEMM_VAR1)
@@ -111,7 +113,7 @@ static void run(int rows, int cols, int depth,
   }
 
   ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
 }
};
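The allocation pattern above recurs in every product file touched by this patch: a single buffer of sizeB = kc*PacketSize*nr + kc*cols scalars whose first kc*PacketSize*nr entries are the kernel's unpackedB scratch, with the scalar packed rhs behind it. The kernel recovers the scratch with blockB - strideB*nr*PacketSize (strideB being kc). A sketch of the layout (editorial; ei_aligned_stack_new replaced by a plain allocation, alignment handling omitted):

    #include <cassert>
    #include <cstdlib>

    // layout: [ unpackedB scratch : kc*PacketSize*nr ][ scalar packed rhs : kc*cols ]
    float* make_blockB(int kc, int cols, int PacketSize, int nr, float*& allocated)
    {
      std::size_t sizeB = std::size_t(kc)*PacketSize*nr + std::size_t(kc)*cols;
      allocated = static_cast<float*>(std::malloc(sizeB*sizeof(float)));
      float* blockB = allocated + std::size_t(kc)*PacketSize*nr;
      // what ei_gebp_kernel does internally, with strideB == kc:
      float* unpackedB = blockB - std::size_t(kc)*nr*PacketSize;
      assert(unpackedB == allocated);
      return blockB;
    }

This also explains the EIGEN_STACK_ALLOCATION_LIMIT change at the end of the patch: with blockB no longer inflated by PacketSize, a much smaller stack-allocation threshold suffices.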
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 89cbc3ac0..785045db4 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -95,14 +95,14 @@ struct ei_symm_pack_rhs
  {
    for(int k=k2; k<end_k; k++)
    {
-      ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(k,j2+0)));
-      ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(k,j2+1)));
+      blockB[count+0] = alpha*rhs(k,j2+0);
+      blockB[count+1] = alpha*rhs(k,j2+1);
      if (nr==4)
      {
-        ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(k,j2+2)));
-        ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(k,j2+3)));
+        blockB[count+2] = alpha*rhs(k,j2+2);
+        blockB[count+3] = alpha*rhs(k,j2+3);
      }
-      count += nr*PacketSize;
+      count += nr;
    }
  }
 
@@ -113,14 +113,14 @@ struct ei_symm_pack_rhs
    // transpose
    for(int k=k2; k<j2; k++)
    {
-      ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+0,k))));
-      ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+1,k))));
+      blockB[count+0] = alpha*ei_conj(rhs(j2+0,k));
+      blockB[count+1] = alpha*ei_conj(rhs(j2+1,k));
      if (nr==4)
      {
-        ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+2,k))));
-        ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+3,k))));
+        blockB[count+2] = alpha*ei_conj(rhs(j2+2,k));
+        blockB[count+3] = alpha*ei_conj(rhs(j2+3,k));
      }
-      count += nr*PacketSize;
+      count += nr;
    }
    // symmetric
    int h = 0;
@@ -128,24 +128,24 @@ struct ei_symm_pack_rhs
    {
      // normal
      for (int w=0 ; w<h; ++w)
-        ei_pstore(&blockB[count+w*PacketSize], ei_pset1(alpha*rhs(k,j2+w)));
+        blockB[count+w] = alpha*rhs(k,j2+w);
      // transpose
      for (int w=h ; w<nr; ++w)
-        ei_pstore(&blockB[count+w*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+w,k))));
-      count += nr*PacketSize;
+        blockB[count+w] = alpha*ei_conj(rhs(j2+w,k));
+      count += nr;
      ++h;
    }
    // normal
    for(int k=j2+nr; k<end_k; k++)
    {
-      ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(k,j2+0)));
-      ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(k,j2+1)));
+      blockB[count+0] = alpha*rhs(k,j2+0);
+      blockB[count+1] = alpha*rhs(k,j2+1);
      if (nr==4)
      {
-        ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(k,j2+2)));
-        ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(k,j2+3)));
+        blockB[count+2] = alpha*rhs(k,j2+2);
+        blockB[count+3] = alpha*rhs(k,j2+3);
      }
-      count += nr*PacketSize;
+      count += nr;
    }
  }
 
@@ -154,14 +154,14 @@ struct ei_symm_pack_rhs
  {
    for(int k=k2; k<end_k; k++)
    {
-      ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+0,k))));
-      ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+1,k))));
+      blockB[count+0] = alpha*ei_conj(rhs(j2+0,k));
+      blockB[count+1] = alpha*ei_conj(rhs(j2+1,k));
      if (nr==4)
      {
-        ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+2,k))));
-        ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+3,k))));
+        blockB[count+2] = alpha*ei_conj(rhs(j2+2,k));
+        blockB[count+3] = alpha*ei_conj(rhs(j2+3,k));
      }
-      count += nr*PacketSize;
+      count += nr;
    }
  }
 
@@ -172,14 +172,14 @@ struct ei_symm_pack_rhs
    int half = std::min(end_k,j2);
    for(int k=k2; k<half; k++)
    {
-      ei_pstore(&blockB[count], ei_pset1(alpha*ei_conj(rhs(j2,k))));
-      count += PacketSize;
+      blockB[count] = alpha*ei_conj(rhs(j2,k));
+      count += 1;
    }
    // normal
    for(int k=half; k<k2+rows; k++)
    {
-      ei_pstore(&blockB[count], ei_pset1(alpha*rhs(k,j2)));
-      count += PacketSize;
+      blockB[count] = alpha*rhs(k,j2);
+      count += 1;
    }
  }
}
@@ -244,7 +244,9 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
  int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction
 
  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+  std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+  Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+  Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
  ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
 
@@ -292,7 +294,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
  }
 
  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
}
};
 
@@ -323,7 +325,9 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
  int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction
 
  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+  std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+  Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+  Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
  ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
 
@@ -346,7 +350,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
  }
 
  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
}
};
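ei_symm_pack_rhs benefits from the same scalar packing, and its indexing is worth spelling out: for panels that cross the diagonal it reads rhs(k,j) on one side of the diagonal and the conjugated mirror rhs(j,k) on the other, so only one triangle of the matrix is ever referenced. A scalar sketch of that mirroring (editorial; the real code also folds in alpha and the nr-wide column blocking):

    #include <complex>

    typedef std::complex<float> Scalar;

    // entry (row,col) of a selfadjoint matrix of which only the lower
    // triangle is stored (column major, 'stride' = outer stride)
    Scalar sym_coeff(const Scalar* m, int stride, int row, int col)
    {
      return row >= col ? m[row + col*stride]              // stored triangle
                        : std::conj(m[col + row*stride]);  // conjugated mirror
    }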
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 37617a915..27c7caf17 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -120,7 +120,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
  int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction
 
  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+  std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+  Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+// Scalar* allocatedBlockB = new Scalar[sizeB];
+  Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
  triangularBuffer.setZero();
@@ -155,7 +158,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
        //    => GEBP with the micro triangular block
        // The trick is to pack this micro block while filling the opposite triangular part with zeros.
-        // To this end we do an extra triangular copy to small temporary buffer
+        // To this end we do an extra triangular copy to a small temporary buffer
        for (int k=0;k<actualPanelWidth;++k)
        {
          if (!(Mode&UnitDiag))
@@ -166,7 +169,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
          pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.stride(), actualPanelWidth, actualPanelWidth);
 
          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols,
-                      actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
+                      actualPanelWidth, actual_kc, 0, blockBOffset);
 
          // GEBP with remaining micro panel
          if (lengthTarget>0)
@@ -176,7 +179,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
 
            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
-                        actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
          }
        }
      }
@@ -196,7 +199,8 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
  }
 
  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+// delete[] allocatedBlockB;
}
};
 
@@ -234,7 +238,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
  int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction
 
  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+  std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+  Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB);
+  Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
  triangularBuffer.setZero();
@@ -252,7 +258,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
    const int actual_kc = std::min(IsLower ? size-k2 : k2, kc);
    int actual_k2 = IsLower ? k2 : k2-actual_kc;
    int rs = IsLower ? actual_k2 : size - k2;
-    Scalar* geb = blockB+actual_kc*actual_kc*Blocking::PacketSize;
+    Scalar* geb = blockB+actual_kc*actual_kc/**Blocking::PacketSize*/;
 
    pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, alpha, actual_kc, rs);
 
@@ -265,7 +271,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
      int panelOffset = IsLower ? j2+actualPanelWidth : 0;
      int panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
      // general part
-      pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
+      pack_rhs_panel(blockB+j2*actual_kc/**Blocking::PacketSize*/,
                     &rhs(actual_k2+panelOffset, actual_j2), rhsStride, alpha,
                     panelLength, actualPanelWidth,
                     actual_kc, panelOffset);
@@ -279,7 +285,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
            triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
        }
 
-      pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
+      pack_rhs_panel(blockB+j2*actual_kc/**Blocking::PacketSize*/,
                     triangularBuffer.data(), triangularBuffer.stride(), alpha,
                     actualPanelWidth, actualPanelWidth,
                     actual_kc, j2);
@@ -300,10 +306,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
          int blockOffset = IsLower ? j2 : 0;
 
          gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
-                      blockA, blockB+j2*actual_kc*Blocking::PacketSize,
+                      blockA, blockB+j2*actual_kc,
                      actual_mc, panelLength, actualPanelWidth,
                     actual_kc, actual_kc,  // strides
-                      blockOffset, blockOffset*Blocking::PacketSize);// offsets
+                      blockOffset, blockOffset);// offsets
        }
      }
      gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
  }
 
  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
}
};
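From here on, every call site drops the *Blocking::PacketSize factor on rhs offsets: since blockB is now indexed in scalars, a panel offset of blockBOffset coefficients is simply blockBOffset entries, and panel addresses become blockB + j2*actual_kc. The same unit change runs through TriangularSolverMatrix.h below. In index arithmetic (hedged illustration, not code from the patch):

    // offset of the j2-th depth panel inside the packed rhs (strideB == actual_kc)
    //   old: blockB + (j2*actual_kc + blockBOffset) * PacketSize   // packet units
    //   new: blockB +  j2*actual_kc + blockBOffset                 // scalar units
    inline const float* panel_ptr(const float* blockB, int actual_kc,
                                  int j2, int blockBOffset)
    {
      return blockB + j2*actual_kc + blockBOffset;
    }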
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 23a645d7c..e32a9929c 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -67,7 +67,9 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
  int mc = std::min<int>(Blocking::Max_mc,size);  // cache block size along the M direction
 
  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+  std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+  Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+  Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
  ei_conj_if<Conjugate> conj;
  ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<Conjugate,false> > gebp_kernel;
@@ -146,7 +148,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
            pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
 
            gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
-                        actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
          }
        }
      }
@@ -169,7 +171,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
  }
 
  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
}
};
 
@@ -198,7 +200,9 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
  int mc = std::min<int>(Blocking::Max_mc,size);  // cache block size along the M direction
 
  Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-  Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize);
+  std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
+  Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+  Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
  ei_conj_if<Conjugate> conj;
  ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<false,Conjugate> > gebp_kernel;
@@ -215,7 +219,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
    int startPanel = IsLower ? 0 : k2+actual_kc;
    int rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
-    Scalar* geb = blockB+actual_kc*actual_kc*Blocking::PacketSize;
+    Scalar* geb = blockB+actual_kc*actual_kc;
 
    if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, -1, actual_kc, rs);
@@ -230,7 +234,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
      int panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
 
      if (panelLength>0)
-        pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
+        pack_rhs_panel(blockB+j2*actual_kc,
                       &rhs(actual_k2+panelOffset, actual_j2), triStride, -1,
                       panelLength, actualPanelWidth,
                       actual_kc, panelOffset);
@@ -260,10 +264,10 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
        if(panelLength>0)
        {
          gebp_kernel(&lhs(i2,absolute_j2), otherStride,
-                      blockA, blockB+j2*actual_kc*Blocking::PacketSize,
+                      blockA, blockB+j2*actual_kc,
                      actual_mc, panelLength, actualPanelWidth,
                      actual_kc, actual_kc, // strides
-                      panelOffset, panelOffset*Blocking::PacketSize); // offsets
+                      panelOffset, panelOffset); // offsets
        }
 
        // unblocked triangular solve
@@ -298,7 +302,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
  }
 
  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, blockB, kc*size*Blocking::PacketSize);
+  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
}
};
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index dc1aa150b..ba92f2370 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -232,7 +232,7 @@ using Eigen::ei_cos;
 #endif
 
 #ifndef EIGEN_STACK_ALLOCATION_LIMIT
-#define EIGEN_STACK_ALLOCATION_LIMIT 1000000
+#define EIGEN_STACK_ALLOCATION_LIMIT 20000
 #endif
 
 #ifndef EIGEN_DEFAULT_IO_FORMAT
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index e99fc2970..ccc155dc5 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -22,8 +22,8 @@ void gemm(const M& a, const M& b, M& c)
 
 int main(int argc, char ** argv)
 {
-  int rep = 2;
-  int s = 1024;
+  int rep = 1;
+  int s = 2048;
  int m = s;
  int n = s;
  int p = s;
diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp
index a9dfaa66f..babf1ec2c 100644
--- a/bench/bench_gemm_blas.cpp
+++ b/bench/bench_gemm_blas.cpp
@@ -31,7 +31,6 @@ static int intone = 1;
 
 void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
 {
-// cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, c.rows(), c.cols(), a.cols(), 1, a.data(), a.rows(), b.data(), b.rows(), 1, c.data(), c.rows());
  int M = c.rows();
  int N = c.cols();
  int K = a.cols();
@@ -39,17 +38,33 @@ void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
  int lda = a.rows();
  int ldb = b.rows();
  int ldc = c.rows();
-
+
  sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
         const_cast<float*>(a.data()),&lda,
-         const_cast<float*>(b.data()),&ldb,&fzero,
+         const_cast<float*>(b.data()),&ldb,&fone,
+         c.data(),&ldc);
+}
+
+void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
+{
+  int M = c.rows();
+  int N = c.cols();
+  int K = a.cols();
+
+  int lda = a.rows();
+  int ldb = b.rows();
+  int ldc = c.rows();
+
+  dgemm_(&notrans,&notrans,&M,&N,&K,&done,
+         const_cast<double*>(a.data()),&lda,
+         const_cast<double*>(b.data()),&ldb,&done,
         c.data(),&ldc);
}
 
int main(int argc, char **argv)
{
-  int rep = 2;
-  int s = 1024;
+  int rep = 1;
+  int s = 2048;
  int m = s;
  int n = s;
  int p = s;
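Both benchmark drivers now time a single repetition of a 2048^3 product, and the BLAS reference accumulates with beta = 1 (fone/done), i.e. it measures c += a*b rather than c = a*b, presumably to match the Eigen side. A minimal harness in the same spirit (editorial sketch: the gemm() and timing details of bench_gemm.cpp are not shown in this diff, so the driver below is illustrative only):

    #include <Eigen/Core>
    #include <cstdio>
    #include <ctime>
    using namespace Eigen;

    int main()
    {
      const int s = 2048;
      MatrixXf a = MatrixXf::Random(s,s), b = MatrixXf::Random(s,s),
               c = MatrixXf::Zero(s,s);
      std::clock_t t0 = std::clock();
      c.noalias() += a * b;                     // same c += a*b convention
      double sec = double(std::clock() - t0) / CLOCKS_PER_SEC;
      std::printf("GFLOPS ~ %.2f\n", 2.0*s*s*double(s)/sec*1e-9);
      return 0;
    }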