about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/products/TriangularMatrixMatrix.h
diff options
context:
space:
mode:
author: Gael Guennebaud <g.gael@free.fr> 2010-02-23 13:06:49 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2010-02-23 13:06:49 +0100
commit: eb905500b6c654860aa9f9d9c77c7c2614e0ad10 (patch)
tree: 73d13d1389ffb7594777e26a52823f6c45a48eec /Eigen/src/Core/products/TriangularMatrixMatrix.h
parent: d579d4cc37693823d03fbfedd2e48c40dcaf8938 (diff)
significant speedup in the matrix-matrix products
Diffstat (limited to 'Eigen/src/Core/products/TriangularMatrixMatrix.h')
-rw-r--r-- Eigen/src/Core/products/TriangularMatrixMatrix.h 30
1 files changed, 18 insertions, 12 deletions
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 37617a915..27c7caf17 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -120,7 +120,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+ std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+ Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+// Scalar* allocatedBlockB = new Scalar[sizeB];
+ Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
triangularBuffer.setZero();
@@ -155,7 +158,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
// => GEBP with the micro triangular block
// The trick is to pack this micro block while filling the opposite triangular part with zeros.
- // To this end we do an extra triangular copy to small temporary buffer
+ // To this end we do an extra triangular copy to a small temporary buffer
for (int k=0;k<actualPanelWidth;++k)
{
if (!(Mode&UnitDiag))
@@ -166,7 +169,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.stride(), actualPanelWidth, actualPanelWidth);
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols,
- actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
+ actualPanelWidth, actual_kc, 0, blockBOffset);
// GEBP with remaining micro panel
if (lengthTarget>0)
@@ -176,7 +179,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
- actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
+ actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
}
@@ -196,7 +199,8 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
}
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
- ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+ ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+// delete[] allocatedBlockB;
}
};
@@ -234,7 +238,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+ std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+ Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB);
+ Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
triangularBuffer.setZero();
@@ -252,7 +258,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
const int actual_kc = std::min(IsLower ? size-k2 : k2, kc);
int actual_k2 = IsLower ? k2 : k2-actual_kc;
int rs = IsLower ? actual_k2 : size - k2;
- Scalar* geb = blockB+actual_kc*actual_kc*Blocking::PacketSize;
+ Scalar* geb = blockB+actual_kc*actual_kc/**Blocking::PacketSize*/;
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, alpha, actual_kc, rs);
@@ -265,7 +271,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
int panelOffset = IsLower ? j2+actualPanelWidth : 0;
int panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
// general part
- pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
+ pack_rhs_panel(blockB+j2*actual_kc/**Blocking::PacketSize*/,
&rhs(actual_k2+panelOffset, actual_j2), rhsStride, alpha,
panelLength, actualPanelWidth,
actual_kc, panelOffset);
@@ -279,7 +285,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
}
- pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
+ pack_rhs_panel(blockB+j2*actual_kc/**Blocking::PacketSize*/,
triangularBuffer.data(), triangularBuffer.stride(), alpha,
actualPanelWidth, actualPanelWidth,
actual_kc, j2);
@@ -300,10 +306,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
int blockOffset = IsLower ? j2 : 0;
gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
- blockA, blockB+j2*actual_kc*Blocking::PacketSize,
+ blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
actual_kc, actual_kc, // strides
- blockOffset, blockOffset*Blocking::PacketSize);// offsets
+ blockOffset, blockOffset);// offsets
}
}
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
@@ -312,7 +318,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
}
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
- ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+ ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
}
};