diff options
author | 2009-08-07 11:09:34 +0200 | |
---|---|---|
committer | 2009-08-07 11:09:34 +0200 | |
commit | d1dc088ef045dcee5747b5c722f5f4f6bb58e2d1 (patch) | |
tree | 6d6d012f9b9f9247bd743eabe5a65130aff3c7e3 /Eigen/src/Core/products/SelfadjointMatrixMatrix.h | |
parent | 543a7857562b2058718d39ce444f3c0495373fc8 (diff) |
* implement a second level of micro blocking (faster for small sizes)
* work around GCC's bad implementation of _mm_set1_p*
Diffstat (limited to 'Eigen/src/Core/products/SelfadjointMatrixMatrix.h')
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 50 |
1 files changed, 31 insertions, 19 deletions
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 1e92ada27..358da3752 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -29,31 +29,43 @@ template<typename Scalar, int mr, int StorageOrder> struct ei_symm_pack_lhs { + enum { PacketSize = ei_packet_traits<Scalar>::size }; + template<int BlockRows> inline + void pack(Scalar* blockA, const ei_const_blas_data_mapper<Scalar,StorageOrder>& lhs, int cols, int i, int& count) + { + // normal copy + for(int k=0; k<i; k++) + for(int w=0; w<BlockRows; w++) + blockA[count++] = lhs(i+w,k); // normal + // symmetric copy + int h = 0; + for(int k=i; k<i+BlockRows; k++) + { + for(int w=0; w<h; w++) + blockA[count++] = ei_conj(lhs(k, i+w)); // transposed + for(int w=h; w<BlockRows; w++) + blockA[count++] = lhs(i+w, k); // normal + ++h; + } + // transposed copy + for(int k=i+BlockRows; k<cols; k++) + for(int w=0; w<BlockRows; w++) + blockA[count++] = ei_conj(lhs(k, i+w)); // transposed + } void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int cols, int rows) { ei_const_blas_data_mapper<Scalar,StorageOrder> lhs(_lhs,lhsStride); int count = 0; - const int peeled_mc = (rows/mr)*mr; + int peeled_mc = (rows/mr)*mr; for(int i=0; i<peeled_mc; i+=mr) { - // normal copy - for(int k=0; k<i; k++) - for(int w=0; w<mr; w++) - blockA[count++] = lhs(i+w,k); // normal - // symmetric copy - int h = 0; - for(int k=i; k<i+mr; k++) - { - for(int w=0; w<h; w++) - blockA[count++] = ei_conj(lhs(k, i+w)); // transposed - for(int w=h; w<mr; w++) - blockA[count++] = lhs(i+w, k); // normal - ++h; - } - // transposed copy - for(int k=i+mr; k<cols; k++) - for(int w=0; w<mr; w++) - blockA[count++] = ei_conj(lhs(k, i+w)); // transposed + pack<mr>(blockA, lhs, cols, i, count); + } + + if(rows-peeled_mc>=PacketSize) + { + pack<PacketSize>(blockA, lhs, cols, peeled_mc, count); + peeled_mc += PacketSize; } // do 
the same with mr==1 |