From c8c81c1e7454dd824607132c78997adee62101fd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Jan 2014 16:18:32 -0800 Subject: Improved the efficiency of the block-panel matrix multiplication code: the change reduces the pressure on the L1 cache by removing the calls to gebp_traits::unpackRhs(). Instead the packetization of the rhs blocks is done on the fly in gebp_traits::loadRhs(). This adds numerous calls to pset1 (since we're packetizing on the fly in the inner loop) but this is more than compensated by the fact that we're decreasing the memory transfers by a factor of RhsPacketSize. --- Eigen/src/Core/products/TriangularMatrixMatrix.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'Eigen/src/Core/products/TriangularMatrixMatrix.h') diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8110507b5..62575aff4 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; triangularBuffer.setZero(); @@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -266,11 +264,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; triangularBuffer.setZero(); @@ -357,14 +353,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix Date: Fri, 25 Apr 2014 22:25:48 +0200 Subject: Make sure that calls to broadcast4 are 16 bytes aligned --- Eigen/src/Core/arch/SSE/PacketMath.h | 6 +++--- 
Eigen/src/Core/products/TriangularMatrixMatrix.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'Eigen/src/Core/products/TriangularMatrixMatrix.h') diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index ad935d5f1..6912f3bc3 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -486,7 +486,7 @@ template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - a3 = ploadu(a); + a3 = pload(a); a0 = vec4f_swizzle1(a3, 0,0,0,0); a1 = vec4f_swizzle1(a3, 1,1,1,1); a2 = vec4f_swizzle1(a3, 2,2,2,2); @@ -502,10 +502,10 @@ pbroadcast4(const double *a, a2 = _mm_loaddup_pd(a+2); a3 = _mm_loaddup_pd(a+3); #else - a1 = ploadu(a); + a1 = pload(a); a0 = vec2d_swizzle1(a1, 0,0); a1 = vec2d_swizzle1(a1, 1,1); - a3 = ploadu(a+2); + a3 = pload(a+2); a2 = vec2d_swizzle1(a3, 0,0); a3 = vec2d_swizzle1(a3, 1,1); #endif diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 62575aff4..8088aa691 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -300,6 +300,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; + geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar)); pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); -- cgit v1.2.3 From e7ef26fa44999b054cbf36fb909f9737a111c4fb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 25 Apr 2014 23:36:22 +0200 Subject: TRMM: Make sure we have enough memory in rhs block to enforce alignment. 
--- Eigen/src/Core/products/TriangularMatrixMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen/src/Core/products/TriangularMatrixMatrix.h') diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8088aa691..db7b27f8e 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -263,7 +263,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix