8 files changed, 353 insertions, 201 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 98df25235..9dc7a3cf3 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -27,110 +27,371 @@
 #ifndef EIGEN_ASSIGN_H
 #define EIGEN_ASSIGN_H
 
-template<typename Derived1, typename Derived2, int UnrollCount>
-struct ei_matrix_assignment_unroller
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for vectorization and unrolling
+***************************************************************************/
+
+enum { // vectorization strategies, selected by ei_assign_traits::Vectorization
+  NoVectorization,     // coefficient-by-coefficient copy
+  InnerVectorization,  // packets along the inner dimension; picked when the inner size is fixed and a multiple of the packet size
+  Like1DVectorization, // packets over the flat index range; picked when both sides have Like1DArrayBit
+  SlicedVectorization  // picked when the inner size is Dynamic; its implementation currently forwards to the non-vectorized loops
+};
+
+enum { // unrolling strategies, selected by ei_assign_traits::Unrolling
+  CompleteUnrolling, // picked when SizeAtCompileTime * CoeffReadCost fits in the unrolling limit
+  InnerUnrolling,    // picked when only InnerSize * CoeffReadCost fits; the outer dimension stays a run-time loop
+  NoUnrolling        // run-time loops only
+};
+
+template <typename Derived, typename OtherDerived>
+struct ei_assign_traits // compile-time choice of the vectorization and unrolling strategy for "Derived = OtherDerived"
+{
+private:
+  enum {
+    InnerSize = int(Derived::Flags)&RowMajorBit
+              ? Derived::ColsAtCompileTime
+              : Derived::RowsAtCompileTime,
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size
+  };
+  // admissible vectorization strategies; all of them need both sides vectorizable with the same storage order
+  enum {
+    MightVectorize = (int(Derived::Flags) & int(OtherDerived::Flags) & VectorizableBit)
+             && ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
+    MayInnerVectorize = MightVectorize && InnerSize!=Dynamic && int(InnerSize)%int(PacketSize)==0,
+    MayLike1DVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & Like1DArrayBit),
+    MaySlicedVectorize = MightVectorize && InnerSize==Dynamic
+  };
+  // the first admissible strategy wins: Inner, then Like1D, then Sliced, otherwise no vectorization
+public:
+  enum {
+    Vectorization = MayInnerVectorize  ? InnerVectorization
+                  : MayLike1DVectorize ? Like1DVectorization
+                  : MaySlicedVectorize ? SlicedVectorization
+                                       : NoVectorization
+  };
+  // unrolling budget: EIGEN_UNROLLING_LIMIT is divided by PacketSize whenever some vectorization is used
+private:
+  enum {
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT / (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize)),
+    MayUnrollCompletely = int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
+    MayUnrollInner      = int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
+  };
+  // Like1D is unrolled completely or not at all; Sliced is never unrolled
+public:
+  enum {
+    Unrolling = (int(Vectorization) == int(InnerVectorization) || int(Vectorization) == int(NoVectorization))
+              ? (
+                   MayUnrollCompletely ? CompleteUnrolling
+                 : MayUnrollInner      ? InnerUnrolling
+                                       : NoUnrolling
+                )
+              : int(Vectorization) == int(Like1DVectorization)
+              ? ( MayUnrollCompletely ? CompleteUnrolling : NoUnrolling )
+              : NoUnrolling
+  };
+};
+
+/***************************************************************************
+* Part 2 : meta-unrollers
+***************************************************************************/
+
+/***********************
+*** No vectorization ***
+***********************/
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct ei_assign_novec_CompleteUnrolling
 {
   enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
+    row = int(Derived1::Flags)&RowMajorBit
+        ? Index / int(Derived1::ColsAtCompileTime)
+        : Index % Derived1::RowsAtCompileTime,
+    col = int(Derived1::Flags)&RowMajorBit
+        ? Index % int(Derived1::ColsAtCompileTime)
+        : Index / Derived1::RowsAtCompileTime
   };
 
   inline static void run(Derived1 &dst, const Derived2 &src)
   {
-    ei_matrix_assignment_unroller<Derived1, Derived2, UnrollCount-1>::run(dst, src);
     dst.coeffRef(row, col) = src.coeff(row, col);
+    ei_assign_novec_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
   }
 };
 
-template<typename Derived1, typename Derived2>
-struct ei_matrix_assignment_unroller<Derived1, Derived2, 1>
+template<typename Derived1, typename Derived2, int Stop>
+struct ei_assign_novec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  inline static void run(Derived1 &, const Derived2 &) {}
+};
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct ei_assign_novec_InnerUnrolling // copies inner indices [Index, Stop) of one row/col, one coefficient per recursion step
+{
+  inline static void run(Derived1 &dst, const Derived2 &src, int row_or_col) // row_or_col: the outer index (a row if dst is row-major, else a col)
+  {
+    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
+    const int row = rowMajor ? row_or_col : Index; // Index is the inner index
+    const int col = rowMajor ? Index : row_or_col;
+    dst.coeffRef(row, col) = src.coeff(row, col);
+    ei_assign_novec_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, row_or_col);
+  } // the recursion ends at the <Stop, Stop> specialization
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct ei_assign_novec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  inline static void run(Derived1 &, const Derived2 &, int) {}
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct ei_assign_innervec_CompleteUnrolling
 {
+  enum {
+    row = int(Derived1::Flags)&RowMajorBit
+        ? Index / int(Derived1::ColsAtCompileTime)
+        : Index % Derived1::RowsAtCompileTime,
+    col = int(Derived1::Flags)&RowMajorBit
+        ? Index % int(Derived1::ColsAtCompileTime)
+        : Index / Derived1::RowsAtCompileTime
+  };
+
   inline static void run(Derived1 &dst, const Derived2 &src)
   {
-    dst.coeffRef(0, 0) = src.coeff(0, 0);
+    dst.template writePacketCoeff<Aligned>(row, col, src.template packetCoeff<Aligned>(row, col));
+    ei_assign_innervec_CompleteUnrolling<Derived1, Derived2,
+      Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
   }
 };
 
-// prevent buggy user code from causing an infinite recursion
-template<typename Derived1, typename Derived2>
-struct ei_matrix_assignment_unroller<Derived1, Derived2, 0>
+template<typename Derived1, typename Derived2, int Stop>
+struct ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
 {
   inline static void run(Derived1 &, const Derived2 &) {}
 };
 
-// Dynamic col-major
+template<typename Derived1, typename Derived2, int Index, int Stop>
+struct ei_assign_innervec_InnerUnrolling // copies inner indices [Index, Stop) of one row/col, one packet per recursion step
+{
+  inline static void run(Derived1 &dst, const Derived2 &src, int row_or_col) // row_or_col: the outer index (a row if dst is row-major, else a col)
+  {
+    const int row = int(Derived1::Flags)&RowMajorBit ? row_or_col : Index; // Index is the inner index of the packet's first coefficient
+    const int col = int(Derived1::Flags)&RowMajorBit ? Index : row_or_col;
+    dst.template writePacketCoeff<Aligned>(row, col, src.template packetCoeff<Aligned>(row, col));
+    ei_assign_innervec_InnerUnrolling<Derived1, Derived2, // Index advances by the packet size, so Stop-Index must be a multiple of it:
+      Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, row_or_col); // only the <Stop, Stop> specialization ends the recursion
+  }
+};
+
+template<typename Derived1, typename Derived2, int Stop>
+struct ei_assign_innervec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
+{
+  inline static void run(Derived1 &, const Derived2 &, int) {}
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+template<typename Derived1, typename Derived2,
+         int Vectorization = ei_assign_traits<Derived1, Derived2>::Vectorization,
+         int Unrolling = ei_assign_traits<Derived1, Derived2>::Unrolling>
+struct ei_assign_impl;
+
+/***********************
+*** No vectorization ***
+***********************/
+
 template<typename Derived1, typename Derived2>
-struct ei_matrix_assignment_unroller<Derived1, Derived2, -1>
+struct ei_assign_impl<Derived1, Derived2, NoVectorization, NoUnrolling>
 {
-  inline static void run(Derived1 &dst, const Derived2 &src)
+  static void run(Derived1 &dst, const Derived2 &src)
   {
-    for(int j = 0; j < dst.cols(); j++)
-      for(int i = 0; i < dst.rows(); i++)
-        dst.coeffRef(i, j) = src.coeff(i, j);
+    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
+    const int innerSize = rowMajor ? dst.cols() : dst.rows();
+    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    for(int j = 0; j < outerSize; j++)
+      for(int i = 0; i < innerSize; i++)
+      {
+        const int row = rowMajor ? j : i;
+        const int col = rowMajor ? i : j;
+        dst.coeffRef(row, col) = src.coeff(row, col);
+      }
   }
 };
 
-// Dynamic row-major
 template<typename Derived1, typename Derived2>
-struct ei_matrix_assignment_unroller<Derived1, Derived2, -2>
+struct ei_assign_impl<Derived1, Derived2, NoVectorization, CompleteUnrolling>
 {
   inline static void run(Derived1 &dst, const Derived2 &src)
   {
-    // traverse in row-major order
-    // in order to allow the compiler to unroll the inner loop
-    for(int i = 0; i < dst.rows(); i++)
-      for(int j = 0; j < dst.cols(); j++)
-        dst.coeffRef(i, j) = src.coeff(i, j);
+    ei_assign_novec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
+      ::run(dst, src);
+  }
+};
+
+template<typename Derived1, typename Derived2>
+struct ei_assign_impl<Derived1, Derived2, NoVectorization, InnerUnrolling>
+{
+  static void run(Derived1 &dst, const Derived2 &src)
+  {
+    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
+    const int innerSize = rowMajor ? Derived1::ColsAtCompileTime : Derived1::RowsAtCompileTime;
+    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    for(int j = 0; j < outerSize; j++)
+      ei_assign_novec_InnerUnrolling<Derived1, Derived2, 0, innerSize>
+        ::run(dst, src, j);
   }
 };
 
-//----
+/**************************
+*** Inner vectorization ***
+**************************/
 
-template<typename Derived1, typename Derived2, int Index>
-struct ei_matrix_assignment_packet_unroller
+template<typename Derived1, typename Derived2>
+struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling>
 {
-  enum {
-    row = int(Derived1::Flags)&RowMajorBit ? Index / int(Derived1::ColsAtCompileTime) : Index % Derived1::RowsAtCompileTime,
-    col = int(Derived1::Flags)&RowMajorBit ? Index % int(Derived1::ColsAtCompileTime) : Index / Derived1::RowsAtCompileTime
-  };
+  static void run(Derived1 &dst, const Derived2 &src)
+  {
+    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
+    const int innerSize = rowMajor ? dst.cols() : dst.rows();
+    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+    for(int j = 0; j < outerSize; j++)
+    {
+      for(int i = 0; i < innerSize; i+=packetSize)
+      {
+        const int row = rowMajor ? j : i;
+        const int col = rowMajor ? i : j;
+        dst.template writePacketCoeff<Aligned>(row, col, src.template packetCoeff<Aligned>(row, col));
+      }  
+    }
+  }
+};
 
+template<typename Derived1, typename Derived2>
+struct ei_assign_impl<Derived1, Derived2, InnerVectorization, CompleteUnrolling>
+{
   inline static void run(Derived1 &dst, const Derived2 &src)
   {
-    ei_matrix_assignment_packet_unroller<Derived1, Derived2,
-      Index-ei_packet_traits<typename Derived1::Scalar>::size>::run(dst, src);
-    dst.template writePacketCoeff<Aligned>(row, col, src.template packetCoeff<Aligned>(row, col));
+    ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
+      ::run(dst, src);
   }
 };
 
 template<typename Derived1, typename Derived2>
-struct ei_matrix_assignment_packet_unroller<Derived1, Derived2, 0 >
+struct ei_assign_impl<Derived1, Derived2, InnerVectorization, InnerUnrolling>
 {
-  inline static void run(Derived1 &dst, const Derived2 &src)
+  static void run(Derived1 &dst, const Derived2 &src)
   {
-    dst.template writePacketCoeff<Aligned>(0, 0, src.template packetCoeff<Aligned>(0, 0));
+    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
+    const int innerSize = rowMajor ? Derived1::ColsAtCompileTime : Derived1::RowsAtCompileTime;
+    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    for(int j = 0; j < outerSize; j++)
+      ei_assign_innervec_InnerUnrolling<Derived1, Derived2, 0, innerSize>
+        ::run(dst, src, j);
   }
 };
 
+/***************************
+*** Like1D vectorization ***
+***************************/
+
 template<typename Derived1, typename Derived2>
-struct ei_matrix_assignment_packet_unroller<Derived1, Derived2, Dynamic>
+struct ei_assign_impl<Derived1, Derived2, Like1DVectorization, NoUnrolling>
 {
-  inline static void run(Derived1 &, const Derived2 &)
-  { ei_internal_assert(false && "ei_matrix_assignment_packet_unroller"); }
+  // vectorized copy over the flat index range [0, alignedSize), then a scalar copy of whatever is left
+  static void run(Derived1 &dst, const Derived2 &src)
+  {
+    const int size = dst.size();
+    const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+    const int alignedSize = (size/packetSize)*packetSize;
+    const bool rowMajor = Derived1::Flags&RowMajorBit;
+    const int innerSize = rowMajor ? dst.cols() : dst.rows();
+    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+
+    // do the vectorizable part of the assignment
+    for (int index = 0; index<alignedSize ; index+=packetSize)
+    {
+      // FIXME the following is not really efficient
+      const int row = rowMajor ? index/innerSize : index%innerSize;
+      const int col = rowMajor ? index%innerSize : index/innerSize;
+      dst.template writePacketCoeff<Aligned>(row, col, src.template packetCoeff<Aligned>(row, col));
+    }
+
+    // now we must do the rest without vectorization.
+    if(alignedSize == size) return;
+    const int k = alignedSize/innerSize;
+
+    // do the remainder of the current row or col
+    for(int i = alignedSize%innerSize; i < innerSize; i++)
+    {
+      const int row = rowMajor ? k : i;
+      const int col = rowMajor ? i : k;
+      dst.coeffRef(row, col) = src.coeff(row, col);
+    }
+
+    // do the remaining rows or cols: j is the outer index, i the inner one (same mapping as the loop above)
+    for(int j = k+1; j < outerSize; j++)
+      for(int i = 0; i < innerSize; i++)
+      {
+        const int row = rowMajor ? j : i;
+        const int col = rowMajor ? i : j;
+        dst.coeffRef(row, col) = src.coeff(row, col);
+      }
+  }
+};
+
+template<typename Derived1, typename Derived2>
+struct ei_assign_impl<Derived1, Derived2, Like1DVectorization, CompleteUnrolling>
+{
+  inline static void run(Derived1 &dst, const Derived2 &src)
+  {
+    const int size = Derived1::SizeAtCompileTime;
+    const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+    const int alignedSize = (size/packetSize)*packetSize;
+    const bool rowMajor = Derived1::Flags&RowMajorBit;
+    const int innerSize = rowMajor ? Derived1::ColsAtCompileTime : Derived1::RowsAtCompileTime;
+    const int outerSize = rowMajor ? Derived1::RowsAtCompileTime : Derived1::ColsAtCompileTime;
+
+    // do the vectorizable part of the assignment
+    ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
+
+    // now we must do the rest without vectorization; if the packets covered everything we stop here,
+    // because k below would then equal outerSize and the scalar tail would write past the last row/col.
+    if(alignedSize == size) return;
+    const int k = alignedSize/innerSize;
+    const int i = alignedSize%innerSize;
+    // do the remainder of the current row or col
+    ei_assign_novec_InnerUnrolling<Derived1, Derived2, i, innerSize>::run(dst, src, k);
+
+    // do the remaining rows or cols
+    for(int j = k+1; j < outerSize; j++)
+      ei_assign_novec_InnerUnrolling<Derived1, Derived2, 0, innerSize>::run(dst, src, j);
+  }
 };
 
-//----
+/***************************
+*** Sliced vectorization ***
+***************************/
+
+template<typename Derived1, typename Derived2>
+struct ei_assign_impl<Derived1, Derived2, SlicedVectorization, NoUnrolling> // placeholder for the sliced strategy
+{
+  static void run(Derived1 &dst, const Derived2 &src)
+  {
+    // FIXME sliced vectorization is unimplemented: forward to the non-vectorized, non-unrolled loops
+    ei_assign_impl<Derived1, Derived2, NoVectorization, NoUnrolling>::run(dst, src);
+  }
+};
 
-template <typename Derived, typename OtherDerived,
-bool Vectorize = (int(Derived::Flags) & int(OtherDerived::Flags) & VectorizableBit)
-              && ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit))
-              && (   (int(Derived::Flags) & int(OtherDerived::Flags) & Like1DArrayBit)
-                  || ((int(Derived::Flags) & RowMajorBit)
-                    ?     int(Derived::ColsAtCompileTime)!=Dynamic
-                      && (int(Derived::ColsAtCompileTime)%ei_packet_traits<typename Derived::Scalar>::size==0)
-                    :     int(Derived::RowsAtCompileTime)!=Dynamic
-                      && (int(Derived::RowsAtCompileTime)%ei_packet_traits<typename Derived::Scalar>::size==0)) ),
-bool Unroll = Derived::SizeAtCompileTime * OtherDerived::CoeffReadCost <= EIGEN_UNROLLING_LIMIT>
-struct ei_assignment_impl;
+/***************************************************************************
+* Part 4 : implementation of MatrixBase methods
+***************************************************************************/
 
 template<typename Derived>
 template<typename OtherDerived>
@@ -139,16 +400,17 @@ inline Derived& MatrixBase<Derived>
 {
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived);
   ei_assert(rows() == other.rows() && cols() == other.cols());
-  ei_assignment_impl<Derived, OtherDerived>::run(derived(),other.derived());
+  ei_assign_impl<Derived, OtherDerived>::run(derived(),other.derived());
   return derived();
 }
 
 template<typename Derived, typename OtherDerived,
-         bool EvalBeforeAssigning = (OtherDerived::Flags & EvalBeforeAssigningBit),
+         bool EvalBeforeAssigning = int(OtherDerived::Flags) & EvalBeforeAssigningBit,
          bool NeedToTranspose = Derived::IsVectorAtCompileTime
                 && OtherDerived::IsVectorAtCompileTime
-                && (int)Derived::RowsAtCompileTime != (int)OtherDerived::RowsAtCompileTime
-                && (int)Derived::ColsAtCompileTime != (int)OtherDerived::ColsAtCompileTime>
+                && int(Derived::RowsAtCompileTime) == int(OtherDerived::ColsAtCompileTime)
+                && int(Derived::ColsAtCompileTime) == int(OtherDerived::RowsAtCompileTime)
+                && int(Derived::SizeAtCompileTime) != 1>
 struct ei_assign_selector;
 
 template<typename Derived, typename OtherDerived>
@@ -176,120 +438,4 @@ inline Derived& MatrixBase<Derived>
   return ei_assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
 }
 
-//----
-
-// no vectorization
-template <typename Derived, typename OtherDerived, bool Unroll>
-struct ei_assignment_impl<Derived, OtherDerived, false, Unroll>
-{
-  static void run(Derived & dst, const OtherDerived & src)
-  {
-    ei_matrix_assignment_unroller
-      <Derived, OtherDerived,
-      Unroll ? int(Derived::SizeAtCompileTime)
-      : Derived::ColsAtCompileTime == Dynamic || Derived::RowsAtCompileTime != Dynamic ? -1 // col-major
-      : -2 // row-major
-      >::run(dst.derived(), src.derived());
-  }
-};
-
-//----
-
-template <typename Derived, typename OtherDerived>
-struct ei_assignment_impl<Derived, OtherDerived, true, true> // vec + unrolling
-{
-  static void run(Derived & dst, const OtherDerived & src)
-  {
-    ei_matrix_assignment_packet_unroller
-      <Derived, OtherDerived,
-       int(Derived::SizeAtCompileTime)-int(ei_packet_traits<typename Derived::Scalar>::size)
-      >::run(dst.const_cast_derived(), src.derived());
-  }
-};
-
-template <typename Derived, typename OtherDerived,
-bool RowMajor = OtherDerived::Flags&RowMajorBit,
-bool Complex1DArray = RowMajor
-  ? (  (Derived::Flags & OtherDerived::Flags & Like1DArrayBit)
-    && (   Derived::ColsAtCompileTime==Dynamic
-       || Derived::ColsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size!=0) )
-  : (  (Derived::Flags & OtherDerived::Flags & Like1DArrayBit)
-    && (  Derived::RowsAtCompileTime==Dynamic
-       || Derived::RowsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size!=0))>
-struct ei_packet_assignment_seclector;
-
-template <typename Derived, typename OtherDerived>
-struct ei_assignment_impl<Derived, OtherDerived, true, false> // vec + no-unrolling
-{
-  static void run(Derived & dst, const OtherDerived & src)
-  {
-    ei_packet_assignment_seclector<Derived,OtherDerived>::run(dst,src);
-  }
-};
-
-template <typename Derived, typename OtherDerived>
-struct ei_packet_assignment_seclector<Derived, OtherDerived, true, true> // row-major + complex 1D array like
-{
-  static void run(Derived & dst, const OtherDerived & src)
-  {
-    const int size = dst.rows() * dst.cols();
-    const int alignedSize = (size/ei_packet_traits<typename Derived::Scalar>::size)
-                            * ei_packet_traits<typename Derived::Scalar>::size;
-    int index = 0;
-    for ( ; index<alignedSize ; index+=ei_packet_traits<typename Derived::Scalar>::size)
-    {
-      // FIXME the following is not really efficient
-      int i = index/dst.cols();
-      int j = index%dst.cols();
-      dst.template writePacketCoeff<Aligned>(i, j, src.template packetCoeff<Aligned>(i, j));
-    }
-    for(int i = alignedSize/dst.cols(); i < dst.rows(); i++)
-      for(int j = alignedSize%dst.cols(); j < dst.cols(); j++)
-        dst.coeffRef(i, j) = src.coeff(i, j);
-  }
-};
-
-template <typename Derived, typename OtherDerived>
-struct ei_packet_assignment_seclector<Derived, OtherDerived, true, false> // row-major + normal
-{
-  static void run(Derived & dst, const OtherDerived & src)
-  {
-    for(int i = 0; i < dst.rows(); i++)
-      for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size)
-        dst.template writePacketCoeff<Aligned>(i, j, src.template packetCoeff<Aligned>(i, j));
-  }
-};
-
-template <typename Derived, typename OtherDerived>
-struct ei_packet_assignment_seclector<Derived, OtherDerived, false, true> // col-major + complex 1D array like
-{
-  static void run(Derived & dst, const OtherDerived & src)
-  {
-    const int size = dst.rows() * dst.cols();
-    const int alignedSize = (size/ei_packet_traits<typename Derived::Scalar>::size)*ei_packet_traits<typename Derived::Scalar>::size;
-    int index = 0;
-    for ( ; index<alignedSize ; index+=ei_packet_traits<typename Derived::Scalar>::size)
-    {
-      // FIXME the following is not really efficient
-      int i = index%dst.rows();
-      int j = index/dst.rows();
-      dst.template writePacketCoeff<Aligned>(i, j, src.template packetCoeff<Aligned>(i, j));
-    }
-    for(int j = alignedSize/dst.rows(); j < dst.cols(); j++)
-      for(int i = alignedSize%dst.rows(); i < dst.rows(); i++)
-        dst.coeffRef(i, j) = src.coeff(i, j);
-  }
-};
-
-template <typename Derived, typename OtherDerived>
-struct ei_packet_assignment_seclector<Derived, OtherDerived, false, false> // col-major + normal
-{
-  static void run(Derived & dst, const OtherDerived & src)
-  {
-    for(int j = 0; j < dst.cols(); j++)
-      for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size)
-        dst.template writePacketCoeff<Aligned>(i, j, src.template packetCoeff<Aligned>(i, j));
-  }
-};
-
 #endif // EIGEN_ASSIGN_H
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index baaae57e4..6fcc76719 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -89,7 +89,7 @@ struct ei_traits<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCols, _Flags> >
     MaxColsAtCompileTime = _MaxCols,
     Flags = ei_corrected_matrix_flags<
                 _Scalar,
-                ei_size_at_compile_time<_MaxRows,_MaxCols>::ret,
+                _Rows, _Cols, _MaxRows, _MaxCols,
                 _Flags
             >::ret,
     CoeffReadCost = NumTraits<Scalar>::ReadCost
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 6875e3158..857a389d6 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -155,20 +155,20 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode
 template<typename T> class ei_product_eval_to_column_major
 {
     typedef typename ei_traits<T>::Scalar _Scalar;
-    enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
+    enum {
+          _Rows = ei_traits<T>::RowsAtCompileTime,
+          _Cols = ei_traits<T>::ColsAtCompileTime,
+          _MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
           _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
           _Flags = ei_traits<T>::Flags
     };
 
   public:
     typedef Matrix<_Scalar,
-                  ei_traits<T>::RowsAtCompileTime,
-                  ei_traits<T>::ColsAtCompileTime,
-                  ei_traits<T>::MaxRowsAtCompileTime,
-                  ei_traits<T>::MaxColsAtCompileTime,
+                  _Rows, _Cols, _MaxRows, _MaxCols,
                   ei_corrected_matrix_flags<
                       _Scalar,
-                      ei_size_at_compile_time<_MaxRows,_MaxCols>::ret,
+                      _Rows, _Cols, _MaxRows, _MaxCols,
                       _Flags
                   >::ret & ~RowMajorBit
             > type;
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 23749d67c..86eecadd5 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -48,8 +48,8 @@ struct ei_traits<Transpose<MatrixType> >
     ColsAtCompileTime = MatrixType::RowsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    Flags = (int(_MatrixTypeNested::Flags) ^ RowMajorBit)
-          & ~( Like1DArrayBit | LowerTriangularBit | UpperTriangularBit)
+    Flags = ((int(_MatrixTypeNested::Flags) ^ RowMajorBit)
+          & ~( Like1DArrayBit | LowerTriangularBit | UpperTriangularBit))
           | (int(_MatrixTypeNested::Flags)&UpperTriangularBit ? LowerTriangularBit : 0)
           | (int(_MatrixTypeNested::Flags)&LowerTriangularBit ? UpperTriangularBit : 0),
     CoeffReadCost = _MatrixTypeNested::CoeffReadCost
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 163832394..fab5b1321 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -94,12 +94,12 @@ const unsigned int SelfAdjointBit = 0x100;
 
 /** \ingroup flags
   *
-  * means the strictly triangular lower part is 0 */
+  * means the strictly lower triangular part is 0 */
 const unsigned int UpperTriangularBit = 0x200;
 
 /** \ingroup flags
   *
-  * means the strictly triangular upper part is 0 */
+  * means the strictly upper triangular part is 0 */
 const unsigned int LowerTriangularBit = 0x400;
 
 /** \ingroup flags
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 3e2b504c5..f586b15d9 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -28,15 +28,13 @@
 template<typename T> struct ei_traits;
 template<typename Lhs, typename Rhs> struct ei_product_eval_mode;
 template<typename T> struct NumTraits;
-template<typename Scalar, int Size, unsigned int SuggestedFlags> class ei_corrected_matrix_flags;
-
-template<int _Rows, int _Cols> struct ei_size_at_compile_time;
+template<typename Scalar, int Rows, int Cols, int MaxRows, int MaxCols, unsigned int SuggestedFlags> class ei_corrected_matrix_flags;
 
 template<typename _Scalar, int _Rows, int _Cols,
          int _MaxRows = _Rows, int _MaxCols = _Cols,
          unsigned int _Flags = ei_corrected_matrix_flags<
                                    _Scalar,
-                                   ei_size_at_compile_time<_MaxRows,_MaxCols>::ret,
+                                   _Rows, _Cols, _MaxRows, _MaxCols,
                                    EIGEN_DEFAULT_MATRIX_FLAGS
                                >::ret
 >
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 5df6d89d0..e50b3bb81 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -147,19 +147,28 @@ template<typename T> struct ei_packet_traits
   enum {size=1};
 };
 
-template<typename Scalar, int Size, unsigned int SuggestedFlags>
+template<typename Scalar, int Rows, int Cols, int MaxRows, int MaxCols, unsigned int SuggestedFlags>
 class ei_corrected_matrix_flags
 {
-    enum { is_vectorizable
+    enum { row_major_bit = (Rows != 1 && Cols != 1)  // if this is not a vector,
+                                                     // then the storage order really matters,
+                                                     // so let us strictly honor the user's choice.
+                         ? SuggestedFlags&RowMajorBit
+                         : Cols > 1 ? RowMajorBit : 0,
+           is_big = MaxRows == Dynamic || MaxCols == Dynamic,
+           inner_size = row_major_bit ? Cols : Rows,
+           vectorizable_bit
             = ei_packet_traits<Scalar>::size > 1
-              && (Size%ei_packet_traits<Scalar>::size==0),
-          _flags1 = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit)) | Like1DArrayBit | DirectAccessBit
+              && (is_big || inner_size%ei_packet_traits<Scalar>::size==0)
+              ? VectorizableBit : 0,
+          
+          _flags1 = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit | VectorizableBit | RowMajorBit))
+                                    | Like1DArrayBit | DirectAccessBit
     };
 
   public:
-    enum { ret = int(is_vectorizable)
-                  ? int(_flags1) | int(VectorizableBit)
-                  : int(_flags1) & ~int(VectorizableBit)
+    enum { ret = int(_flags1)  // the suggested flags with the recomputed bits cleared, plus Like1DArrayBit | DirectAccessBit
+               | int(vectorizable_bit) | int(row_major_bit)  // then the bits recomputed above are put back
     };
 };
 
@@ -171,20 +180,19 @@ template<int _Rows, int _Cols> struct ei_size_at_compile_time
 template<typename T> class ei_eval
 {
     typedef typename ei_traits<T>::Scalar _Scalar;
-    enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
+    enum {_Rows = ei_traits<T>::RowsAtCompileTime,
+          _Cols = ei_traits<T>::ColsAtCompileTime,
+          _MaxRows = ei_traits<T>::MaxRowsAtCompileTime,
           _MaxCols = ei_traits<T>::MaxColsAtCompileTime,
           _Flags = ei_traits<T>::Flags
     };
 
   public:
     typedef Matrix<_Scalar,
-                  ei_traits<T>::RowsAtCompileTime,
-                  ei_traits<T>::ColsAtCompileTime,
-                  ei_traits<T>::MaxRowsAtCompileTime,
-                  ei_traits<T>::MaxColsAtCompileTime,
+                  _Rows, _Cols, _MaxRows, _MaxCols,
                   ei_corrected_matrix_flags<
                       _Scalar,
-                      ei_size_at_compile_time<_MaxRows,_MaxCols>::ret,
+                      _Rows, _Cols, _MaxRows, _MaxCols,
                       _Flags
                   >::ret
             > type;
diff --git a/bench/benchmark.cpp b/bench/benchmark.cpp
index abdfbd55a..b48b21d68 100644
--- a/bench/benchmark.cpp
+++ b/bench/benchmark.cpp
@@ -1,5 +1,5 @@
 // g++ -O3 -DNDEBUG -DMATSIZE=<x> benchmark.cpp -o benchmark && time ./benchmark
-#include <Eigen/Core>
+#include <Eigen/Array>
 
 #ifndef MATSIZE
 #define MATSIZE 3