8 files changed, 101 insertions, 61 deletions
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 5074ea88f..379b5e1f4 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -346,6 +346,17 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
     {
       return *this;
     }
+
+    /** Override MatrixBase::swap() since for dynamic-sized matrices of same type it is enough to swap the
+      * data pointers.
+      */
+    void swap(Matrix& other)
+    {
+      if (Base::SizeAtCompileTime==Dynamic)
+        m_storage.swap(other.m_storage);
+      else
+        this->Base::swap(other);
+    }
 };
 
 #define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix) \
diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h
index 72b9965b6..8f5c247be 100644
--- a/Eigen/src/Core/MatrixStorage.h
+++ b/Eigen/src/Core/MatrixStorage.h
@@ -84,6 +84,7 @@ template<typename T, int Size, int _Rows, int _Cols> class ei_matrix_storage
   public:
     inline ei_matrix_storage() {}
     inline ei_matrix_storage(int,int,int) {}
+    inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); }
     inline static int rows(void) {return _Rows;}
     inline static int cols(void) {return _Cols;}
     inline void resize(int,int,int) {}
@@ -100,6 +101,8 @@ template<typename T, int Size> class ei_matrix_storage<T, Size, Dynamic, Dynamic
   public:
     inline ei_matrix_storage(int, int rows, int cols) : m_rows(rows), m_cols(cols) {}
     inline ~ei_matrix_storage() {}
+    inline void swap(ei_matrix_storage& other)
+    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
     inline int rows(void) const {return m_rows;}
     inline int cols(void) const {return m_cols;}
     inline void resize(int, int rows, int cols)
@@ -119,6 +122,7 @@ template<typename T, int Size, int _Cols> class ei_matrix_storage<T, Size, Dynam
   public:
     inline ei_matrix_storage(int, int rows, int) : m_rows(rows) {}
     inline ~ei_matrix_storage() {}
+    inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
     inline int rows(void) const {return m_rows;}
     inline int cols(void) const {return _Cols;}
     inline void resize(int size, int rows, int)
@@ -137,6 +141,7 @@ template<typename T, int Size, int _Rows> class ei_matrix_storage<T, Size, _Rows
   public:
     inline ei_matrix_storage(int, int, int cols) : m_cols(cols) {}
     inline ~ei_matrix_storage() {}
+    inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
     inline int rows(void) const {return _Rows;}
     inline int cols(void) const {return m_cols;}
     inline void resize(int size, int, int cols)
@@ -157,6 +162,8 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic>
     inline ei_matrix_storage(int size, int rows, int cols)
       : m_data(ei_aligned_malloc<T>(size)), m_rows(rows), m_cols(cols) {}
     inline ~ei_matrix_storage() { delete[] m_data; }
+    inline void swap(ei_matrix_storage& other)
+    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
     inline int rows(void) const {return m_rows;}
     inline int cols(void) const {return m_cols;}
     void resize(int size, int rows, int cols)
@@ -181,6 +188,7 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam
   public:
     inline ei_matrix_storage(int size, int, int cols) : m_data(ei_aligned_malloc<T>(size)), m_cols(cols) {}
     inline ~ei_matrix_storage() { delete[] m_data; }
+    inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
     inline static int rows(void) {return _Rows;}
     inline int cols(void) const {return m_cols;}
     void resize(int size, int, int cols)
@@ -204,6 +212,7 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co
   public:
     inline ei_matrix_storage(int size, int rows, int) : m_data(ei_aligned_malloc<T>(size)), m_rows(rows) {}
     inline ~ei_matrix_storage() { delete[] m_data; }
+    inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
     inline int rows(void) const {return m_rows;}
     inline static int cols(void) {return _Cols;}
     void resize(int size, int rows, int)
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 2bfd15878..90f9bdca4 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -30,11 +30,6 @@
 *** Forward declarations ***
 ***************************/
 
-enum {
-  ColMajorProduct,
-  RowMajorProduct
-};
-
 template<int VectorizationMode, int Index, typename Lhs, typename Rhs>
 struct ei_product_coeff_impl;
 
@@ -238,7 +233,7 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
     const PacketScalar packet(int row, int col) const
     {
       PacketScalar res;
-      ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
+      ei_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
                                    Unroll ? InnerSize-1 : Dynamic,
                                    _LhsNested, _RhsNested, PacketScalar, LoadMode>
         ::run(row, col, m_lhs, m_rhs, res);
@@ -362,27 +357,27 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
 *******************/
 
 template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
+struct ei_product_packet_impl<RowMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
+    ei_product_packet_impl<RowMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
     res =  ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
   }
 };
 
 template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
+struct ei_product_packet_impl<ColMajor, Index, Lhs, Rhs, PacketScalar, LoadMode>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
+    ei_product_packet_impl<ColMajor, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
     res =  ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };
 
 template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
+struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
@@ -391,7 +386,7 @@ struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMo
 };
 
 template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
+struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
@@ -399,8 +394,8 @@ struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMo
   }
 };
 
-template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
+template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
+struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
   {
@@ -411,7 +406,7 @@ struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar, Loa
 };
 
 template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
-struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
+struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
   {
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index 5e3187071..35590b56f 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -73,7 +73,7 @@ struct ei_swap_selector<Derived,OtherDerived,true>
 template<typename Derived, typename OtherDerived>
 struct ei_swap_selector<Derived,OtherDerived,false>
 {
-  inline void run(Derived& src, OtherDerived& other)
+  inline static void run(Derived& src, OtherDerived& other)
   {
     typename Derived::Scalar tmp;
     for(int j = 0; j < src.cols(); j++)
diff --git a/bench/benchBlasGemm.cpp b/bench/benchBlasGemm.cpp
index d22af89da..02f067e1a 100644
--- a/bench/benchBlasGemm.cpp
+++ b/bench/benchBlasGemm.cpp
@@ -1,4 +1,8 @@
 
+// g++-4.2 -O3 -DNDEBUG -I.. benchBlasGemm.cpp /usr/lib/libcblas.so.3 - o benchBlasGemm
+// possible options:
+//    -DEIGEN_DONT_VECTORIZE
+//    -msse2
 
 // #define EIGEN_DEFAULT_TO_ROW_MAJOR
 #define _FLOAT
@@ -28,6 +32,8 @@ void check_product(void);
 
 int main(int argc, char *argv[])
 {
+  // disable SSE exceptions
+  #ifdef __GNUC__
   {
     int aux;
     asm(
@@ -36,6 +42,7 @@ int main(int argc, char *argv[])
     "ldmxcsr   %[aux]           \n\t"
     : : [aux] "m" (aux));
   }
+  #endif
 
   int nbtries=1, nbloops=1, M, N, K;
 
@@ -70,8 +77,18 @@ int main(int argc, char *argv[])
   }
   else
   {
+    std::cout << "Usage: " << argv[0] << " size  \n";
+    std::cout << "Usage: " << argv[0] << " auto size\n";
     std::cout << "Usage: " << argv[0] << " size nbloops nbtries\n";
     std::cout << "Usage: " << argv[0] << " M N K nbloops nbtries\n";
+    std::cout << "Usage: " << argv[0] << " check\n";
+    std::cout << "Options:\n"
+    std::cout << "    size       unique size of the 2 matrices (integer)\n";
+    std::cout << "    auto       automatically set the number of repetitions and tries\n";
+    std::cout << "    nbloops    number of times the GEMM routines is executed\n"
+    std::cout << "    nbtries    number of times the loop is benched (return the best try)\n"
+    std::cout << "    M N K      sizes of the matrices: MxN  =  MxK * KxN (integers)\n"
+    std::cout << "    check      check eigen product using cblas as a reference\n";
     exit(1);
   }
 
@@ -119,7 +136,7 @@ int main(int argc, char *argv[])
   mb = MyMatrix::random(K,N);
   mc = MyMatrix::random(M,N);
 
-  // eigen fast ?
+  // eigen
 //   if (!(std::string(argv[1])=="auto"))
   {
       timer.reset();
@@ -158,20 +175,10 @@ int main(int argc, char *argv[])
 
 using namespace Eigen;
 
-
-
 void bench_eigengemm(MyMatrix& mc, const MyMatrix& ma, const MyMatrix& mb, int nbloops)
 {
   for (uint j=0 ; j<nbloops ; ++j)
-    #ifdef EIGEN_WIP_PRODUCT_DIRTY
-      mc = ma * mb;
-    #else
       mc += (ma * mb).lazy();
-      /*static_cast<MatrixBase<MyMatrix>& >*//*(mc).operator+=( (ma * mb).lazy() );*/
-
-//       Flagged<Product<MyMatrix,MyMatrix,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>(
-//         Product<MyMatrix,MyMatrix,CacheFriendlyProduct>(ma, mb)));
-    #endif
 }
 
 void bench_eigengemm_normal(MyMatrix& mc, const MyMatrix& ma, const MyMatrix& mb, int nbloops)
@@ -183,7 +190,6 @@ void bench_eigengemm_normal(MyMatrix& mc, const MyMatrix& ma, const MyMatrix& mb
 #define MYVERIFY(A,M) if (!(A)) { \
     std::cout << "FAIL: " << M << "\n"; \
   }
-
 void check_product(int M, int N, int K)
 {
   MyMatrix ma(M,K), mb(K,N), mc(M,N), maT(K,M), mbT(N,K), meigen(M,N), mref(M,N);
@@ -200,20 +206,20 @@ void check_product(int M, int N, int K)
   meigen += ma * mb;
   MYVERIFY(meigen.isApprox(mref, eps),". * .");
 
-//   meigen = mref = mc;
-//   CBLAS_GEMM(CblasColMajor, CblasTrans, CblasNoTrans, M, N, K, 1, maT.data(), K, mb.data(), K, 1, mref.data(), M);
-//   meigen += maT.transpose() * mb;
-//   MYVERIFY(meigen.isApprox(mref, eps),"T * .");
-// 
-//   meigen = mref = mc;
-//   CBLAS_GEMM(CblasColMajor, CblasTrans, CblasTrans, M, N, K, 1, maT.data(), K, mbT.data(), N, 1, mref.data(), M);
-//   meigen += (maT.transpose()) * (mbT.transpose());
-//   MYVERIFY(meigen.isApprox(mref, eps),"T * T");
-// 
-//   meigen = mref = mc;
-//   CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, ma.data(), M, mbT.data(), N, 1, mref.data(), M);
-//   meigen += ma * mbT.transpose();
-//   MYVERIFY(meigen.isApprox(mref, eps),". * T");
+  meigen = mref = mc;
+  CBLAS_GEMM(CblasColMajor, CblasTrans, CblasNoTrans, M, N, K, 1, maT.data(), K, mb.data(), K, 1, mref.data(), M);
+  meigen += maT.transpose() * mb;
+  MYVERIFY(meigen.isApprox(mref, eps),"T * .");
+
+  meigen = mref = mc;
+  CBLAS_GEMM(CblasColMajor, CblasTrans, CblasTrans, M, N, K, 1, maT.data(), K, mbT.data(), N, 1, mref.data(), M);
+  meigen += (maT.transpose()) * (mbT.transpose());
+  MYVERIFY(meigen.isApprox(mref, eps),"T * T");
+
+  meigen = mref = mc;
+  CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, ma.data(), M, mbT.data(), N, 1, mref.data(), M);
+  meigen += ma * mbT.transpose();
+  MYVERIFY(meigen.isApprox(mref, eps),". * T");
 }
 
 void check_product(void)
diff --git a/bench/benchmark.cpp b/bench/benchmark.cpp
index b48b21d68..5aeb2d315 100644
--- a/bench/benchmark.cpp
+++ b/bench/benchmark.cpp
@@ -18,17 +18,6 @@ USING_PART_OF_NAMESPACE_EIGEN
 
 int main(int argc, char *argv[])
 {
-    Matrix4i m1, m2, m3;
-    m1.setRandom();
-    m2.setConstant(2);
-    int s1 = 2;
-    m3 = m1;
-    std::cout << m1 << "\n\n";
-    std::cout << m2 << "\n\n";
-    m3 = m1.cwiseProduct(m2);
-    std::cout << m3 << "\n==\n" << m1*s1 << "\n\n";
-//     v(1,2,3,4);
-//     std::cout << v * 2 << "\n";
     Matrix<SCALAR,MATSIZE,MATSIZE> I = Matrix<SCALAR,MATSIZE,MATSIZE>::ones();
     Matrix<SCALAR,MATSIZE,MATSIZE> m;
     for(int i = 0; i < MATSIZE; i++)
@@ -39,7 +28,7 @@ int main(int argc, char *argv[])
     asm("#begin");
     for(int a = 0; a < REPEAT; a++)
     {
-        m = Matrix<SCALAR,MATSIZE,MATSIZE>::ones() + 0.00005 * (m + (m*m).eval());
+        m = Matrix<SCALAR,MATSIZE,MATSIZE>::ones() + 0.00005 * (m + (m*m));
     }
     asm("#end");
     cout << m << endl;
diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp
index a2ad3dedf..4e3db3c1b 100644
--- a/test/basicstuff.cpp
+++ b/test/basicstuff.cpp
@@ -82,6 +82,12 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
   {
     VERIFY_RAISES_ASSERT(m1 = (m2.block(0,0, rows-1, cols-1)));
   }
+
+  // test swap
+  m3 = m1;
+  m1.swap(m2);
+  VERIFY_IS_APPROX(m3, m2);
+  VERIFY_IS_NOT_APPROX(m3, m1);
 }
 
 void test_basicstuff()
diff --git a/test/product.cpp b/test/product.cpp
index f1e26d20a..2fc50899f 100644
--- a/test/product.cpp
+++ b/test/product.cpp
@@ -31,8 +31,10 @@ template<typename MatrixType> void product(const MatrixType& m)
   */
 
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::FloatingPoint FloatingPoint;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> RowSquareMatrixType;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> ColSquareMatrixType;
 
   int rows = m.rows();
   int cols = m.cols();
@@ -43,11 +45,13 @@ template<typename MatrixType> void product(const MatrixType& m)
              m2 = MatrixType::random(rows, cols),
              m3(rows, cols),
              mzero = MatrixType::zero(rows, cols);
-  SquareMatrixType
-             identity = Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>
-                              ::identity(rows, rows),
-             square = Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime>
-                              ::random(rows, rows);
+  RowSquareMatrixType
+             identity = RowSquareMatrixType::identity(rows, rows),
+             square = RowSquareMatrixType::random(rows, rows),
+             res = RowSquareMatrixType::random(rows, rows);
+  ColSquareMatrixType
+             square2 = ColSquareMatrixType::random(cols, cols),
+             res2 = ColSquareMatrixType::random(cols, cols);
   VectorType v1 = VectorType::random(rows),
              v2 = VectorType::random(rows),
              vzero = VectorType::zero(rows);
@@ -83,6 +87,20 @@ template<typename MatrixType> void product(const MatrixType& m)
 
   if (rows!=cols)
     VERIFY_RAISES_ASSERT(m3 = m1*m1);
+
+  // test the previous tests were not screwed up because operator* returns 0
+  VERIFY_IS_NOT_APPROX((m1.transpose()*m2).template cast<FloatingPoint>(), (m2.transpose()*m1).template cast<FloatingPoint>());
+
+  // test optimized operator+= path
+  res = square;
+  res += (m1 * m2.transpose()).lazy();
+  VERIFY_IS_APPROX(res, square + m1 * m2.transpose());
+  VERIFY_IS_NOT_APPROX(res.template cast<FloatingPoint>(), (square + m2 * m1.transpose()).template cast<FloatingPoint>());
+
+  res2 = square2;
+  res2 += (m1.transpose() * m2).lazy();
+  VERIFY_IS_APPROX(res2, square2 + m1.transpose() * m2);
+  VERIFY_IS_NOT_APPROX(res2.template cast<FloatingPoint>(), (square2 + m2.transpose() * m1).template cast<FloatingPoint>());
 }
 
 void test_product()
@@ -93,11 +111,17 @@ void test_product()
     CALL_SUBTEST( product(Matrix4d()) );
     CALL_SUBTEST( product(Matrix4f()) );
     CALL_SUBTEST( product(MatrixXf(3,5)) );
+    CALL_SUBTEST( product(MatrixXi(28,39)) );
   }
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST( product(MatrixXf(ei_random<int>(1,320), ei_random<int>(1,320))) );
     CALL_SUBTEST( product(MatrixXd(ei_random<int>(1,320), ei_random<int>(1,320))) );
-    CALL_SUBTEST( product(MatrixXi(ei_random<int>(1,320), ei_random<int>(1,320))) );
+    CALL_SUBTEST( product(MatrixXi(ei_random<int>(1,256), ei_random<int>(1,256))) );
     CALL_SUBTEST( product(MatrixXcf(ei_random<int>(1,50), ei_random<int>(1,50))) );
+    #ifndef EIGEN_DEFAULT_TO_ROW_MAJOR
+    CALL_SUBTEST( product(Matrix<float,Dynamic,Dynamic,Dynamic,Dynamic,RowMajorBit>(ei_random<int>(1,320), ei_random<int>(1,320))) );
+    #else
+    CALL_SUBTEST( product(Matrix<float,Dynamic,Dynamic,Dynamic,Dynamic,0>(ei_random<int>(1,320), ei_random<int>(1,320))) );
+    #endif
   }
 }