* implement a second level of micro blocking (faster for small sizes)

* work around GCC's bad implementation of _mm_set1_p*
author: Gael Guennebaud <g.gael@free.fr> 2009-08-07 11:09:34 +0200
committer: Gael Guennebaud <g.gael@free.fr> 2009-08-07 11:09:34 +0200
commit: d1dc088ef045dcee5747b5c722f5f4f6bb58e2d1 (patch)
tree: 6d6d012f9b9f9247bd743eabe5a65130aff3c7e3 /Eigen/src/Core/products/SelfadjointMatrixMatrix.h
parent: 543a7857562b2058718d39ce444f3c0495373fc8 (diff)
1 file changed, 31 insertions, 19 deletions
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 1e92ada27..358da3752 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -29,31 +29,43 @@
 template<typename Scalar, int mr, int StorageOrder>
 struct ei_symm_pack_lhs
 {
+  enum { PacketSize = ei_packet_traits<Scalar>::size };
+  template<int BlockRows> inline
+  void pack(Scalar* blockA, const ei_const_blas_data_mapper<Scalar,StorageOrder>& lhs, int cols, int i, int& count)
+  {
+    // normal copy
+    for(int k=0; k<i; k++)
+      for(int w=0; w<BlockRows; w++)
+        blockA[count++] = lhs(i+w,k);           // normal
+    // symmetric copy
+    int h = 0;
+    for(int k=i; k<i+BlockRows; k++)
+    {
+      for(int w=0; w<h; w++)
+        blockA[count++] = ei_conj(lhs(k, i+w)); // transposed
+      for(int w=h; w<BlockRows; w++)
+        blockA[count++] = lhs(i+w, k);          // normal
+      ++h;
+    }
+    // transposed copy
+    for(int k=i+BlockRows; k<cols; k++)
+      for(int w=0; w<BlockRows; w++)
+        blockA[count++] = ei_conj(lhs(k, i+w)); // transposed
+  }
   void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int cols, int rows)
   {
     ei_const_blas_data_mapper<Scalar,StorageOrder> lhs(_lhs,lhsStride);
     int count = 0;
-    const int peeled_mc = (rows/mr)*mr;
+    int peeled_mc = (rows/mr)*mr;
     for(int i=0; i<peeled_mc; i+=mr)
     {
-      // normal copy
-      for(int k=0; k<i; k++)
-        for(int w=0; w<mr; w++)
-          blockA[count++] = lhs(i+w,k);           // normal
-      // symmetric copy
-      int h = 0;
-      for(int k=i; k<i+mr; k++)
-      {
-        for(int w=0; w<h; w++)
-          blockA[count++] = ei_conj(lhs(k, i+w)); // transposed
-        for(int w=h; w<mr; w++)
-          blockA[count++] = lhs(i+w, k);          // normal
-        ++h;
-      }
-      // transposed copy
-      for(int k=i+mr; k<cols; k++)
-        for(int w=0; w<mr; w++)
-          blockA[count++] = ei_conj(lhs(k, i+w)); // transposed
+      pack<mr>(blockA, lhs, cols, i, count);
+    }
+
+    if(rows-peeled_mc>=PacketSize)
+    {
+      pack<PacketSize>(blockA, lhs, cols, peeled_mc, count);
+      peeled_mc += PacketSize;
     }
 
     // do the same with mr==1
author	Gael Guennebaud <g.gael@free.fr>	2009-08-07 11:09:34 +0200
committer	Gael Guennebaud <g.gael@free.fr>	2009-08-07 11:09:34 +0200
commit	d1dc088ef045dcee5747b5c722f5f4f6bb58e2d1 (patch)
tree	6d6d012f9b9f9247bd743eabe5a65130aff3c7e3 /Eigen/src/Core/products/SelfadjointMatrixMatrix.h
parent	543a7857562b2058718d39ce444f3c0495373fc8 (diff)