-rw-r--r--  Eigen/src/Core/Assign.h             |  63
-rw-r--r--  Eigen/src/Core/Block.h              |  99
-rw-r--r--  Eigen/src/Core/Coeffs.h             | 136
-rw-r--r--  Eigen/src/Core/CwiseBinaryOp.h      |  13
-rw-r--r--  Eigen/src/Core/CwiseNullaryOp.h     |  15
-rw-r--r--  Eigen/src/Core/CwiseUnaryOp.h       |  11
-rw-r--r--  Eigen/src/Core/DiagonalCoeffs.h     |  13
-rw-r--r--  Eigen/src/Core/DiagonalProduct.h    |   3
-rw-r--r--  Eigen/src/Core/Dot.h                |  25
-rw-r--r--  Eigen/src/Core/Flagged.h            |  22
-rw-r--r--  Eigen/src/Core/Functors.h           |  13
-rw-r--r--  Eigen/src/Core/IO.h                 |  12
-rwxr-xr-x  Eigen/src/Core/InverseProduct.h     |  30
-rw-r--r--  Eigen/src/Core/Map.h                |  10
-rw-r--r--  Eigen/src/Core/Matrix.h             |  45
-rw-r--r--  Eigen/src/Core/MatrixBase.h         |  13
-rw-r--r--  Eigen/src/Core/NestByValue.h        |  22
-rw-r--r--  Eigen/src/Core/Product.h            |  18
-rw-r--r--  Eigen/src/Core/Sum.h                |  45
-rw-r--r--  Eigen/src/Core/util/Constants.h     |  73
-rw-r--r--  Eigen/src/Core/util/Meta.h          |   3
-rw-r--r--  Eigen/src/Core/util/StaticAssert.h  |   4
-rw-r--r--  bench/benchVecAdd.cpp               | 134
23 files changed, 558 insertions(+), 264 deletions(-)
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index c28a0371b..85694b35f 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -301,45 +301,14 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = dst.size();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
- const bool rowMajor = Derived1::Flags&RowMajorBit;
- const int innerSize = rowMajor ? dst.cols() : dst.rows();
- const int outerSize = rowMajor ? dst.rows() : dst.cols();
- int index = 0;
-
- // do the vectorizable part of the assignment
- int row = 0;
- int col = 0;
- while (index<alignedSize)
- {
- int start = rowMajor ? col : row;
- int end = std::min(innerSize, start + alignedSize-index);
- for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
- dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
- index += (rowMajor ? col : row) - start;
- row = rowMajor ? index/innerSize : index%innerSize;
- col = rowMajor ? index%innerSize : index/innerSize;
- }
-
- // now we must do the rest without vectorization.
- if(alignedSize == size) return;
- const int k = alignedSize/innerSize;
- // do the remainder of the current row or col
- for(int i = alignedSize%innerSize; i < innerSize; i++)
+ for(int index = 0; index < alignedSize; index += packetSize)
{
- const int row = rowMajor ? k : i;
- const int col = rowMajor ? i : k;
- dst.coeffRef(row, col) = src.coeff(row, col);
+ dst.template writePacket<Aligned>(index, src.template packet<Aligned>(index));
}
- // do the remaining rows or cols
- for(int j = k+1; j < outerSize; j++)
- for(int i = 0; i < innerSize; i++)
- {
- const int row = rowMajor ? i : j;
- const int col = rowMajor ? j : i;
- dst.coeffRef(row, col) = src.coeff(row, col);
- }
+ for(int index = alignedSize; index < size; index++)
+ dst.coeffRef(index) = src.coeff(index);
}
};
@@ -351,23 +320,9 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling
const int size = Derived1::SizeAtCompileTime;
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
- const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
- const int innerSize = rowMajor ? int(Derived1::ColsAtCompileTime) : int(Derived1::RowsAtCompileTime);
- const int outerSize = rowMajor ? int(Derived1::RowsAtCompileTime) : int(Derived1::ColsAtCompileTime);
- // do the vectorizable part of the assignment
ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
-
- // now we must do the rest without vectorization.
- const int k = alignedSize/innerSize;
- const int i = alignedSize%innerSize;
-
- // do the remainder of the current row or col
- ei_assign_novec_InnerUnrolling<Derived1, Derived2, i, k<outerSize ? innerSize : 0>::run(dst, src, k);
-
- // do the remaining rows or cols
- for(int j = k+1; j < outerSize; j++)
- ei_assign_novec_InnerUnrolling<Derived1, Derived2, 0, innerSize>::run(dst, src, j);
+ ei_assign_novec_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
}
};
@@ -432,8 +387,8 @@ template<typename Derived, typename OtherDerived,
struct ei_assign_selector;
template<typename Derived, typename OtherDerived>
-struct ei_assign_selector<Derived,OtherDerived,true,true> {
- static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
+struct ei_assign_selector<Derived,OtherDerived,false,false> {
+ static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
};
template<typename Derived, typename OtherDerived>
struct ei_assign_selector<Derived,OtherDerived,true,false> {
@@ -444,8 +399,8 @@ struct ei_assign_selector<Derived,OtherDerived,false,true> {
static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
};
template<typename Derived, typename OtherDerived>
-struct ei_assign_selector<Derived,OtherDerived,false,false> {
- static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
+struct ei_assign_selector<Derived,OtherDerived,true,true> {
+ static Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
};
template<typename Derived>
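
With index-based (linear) access available, the assignment kernel above collapses to a flat loop: an aligned packet pass over the largest multiple of the packet size, then a scalar loop for the remainder. A minimal standalone sketch of that pattern, written against raw SSE so it compiles on its own (copy_linear and the 16-byte alignment of dst/src are assumptions of this sketch, not part of the patch):

    #include <xmmintrin.h>

    // Sketch of the linear vectorized assignment pattern: packet pass,
    // then scalar remainder. Both pointers assumed 16-byte aligned.
    void copy_linear(float* dst, const float* src, int size)
    {
      const int packetSize = 4;                                 // floats per SSE packet
      const int alignedSize = (size / packetSize) * packetSize; // vectorizable prefix
      for (int i = 0; i < alignedSize; i += packetSize)
        _mm_store_ps(dst + i, _mm_load_ps(src + i));            // like writePacket<Aligned>
      for (int i = alignedSize; i < size; ++i)                  // leftover coefficients
        dst[i] = src[i];
    }
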
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 5c9ad69d5..e9f5bab29 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -71,13 +71,12 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
|| (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
? ~LargeBit
: ~(unsigned int)0,
- MaskPacketAccessBit = ei_corrected_matrix_flags<
- Scalar, RowsAtCompileTime, ColsAtCompileTime,
- MaxRowsAtCompileTime, MaxColsAtCompileTime, MatrixType::Flags
- >::ret & PacketAccessBit,
- FlagsLinearAccessBit = MatrixType::Flags & RowMajorBit
- ? (RowsAtCompileTime == 1 ? LinearAccessBit : 0)
- : (ColsAtCompileTime == 1 ? LinearAccessBit : 0),
+ RowMajor = int(MatrixType::Flags)&RowMajorBit,
+ InnerSize = RowMajor ? ColsAtCompileTime : RowsAtCompileTime,
+ InnerMaxSize = RowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime,
+ MaskPacketAccessBit = (InnerMaxSize == Dynamic || (InnerSize % ei_packet_traits<Scalar>::size) == 0)
+ ? PacketAccessBit : 0,
+ FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
Flags = (MatrixType::Flags & (HereditaryBits | MaskPacketAccessBit | DirectAccessBit) & MaskLargeBit)
| FlagsLinearAccessBit,
CoeffReadCost = MatrixType::CoeffReadCost
@@ -153,6 +152,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
}
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_matrix.const_cast_derived()
+ .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ }
+
+ inline const Scalar _coeff(int index) const
+ {
+ return m_matrix
+ .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ }
+
template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
@@ -165,6 +179,21 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
m_matrix.const_cast_derived().template writePacket<UnAligned>(row + m_startRow.value(), col + m_startCol.value(), x);
}
+ template<int LoadMode>
+ inline PacketScalar _packet(int index) const
+ {
+ return m_matrix.template packet<UnAligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ }
+
+ template<int LoadMode>
+ inline void _writePacket(int index, const PacketScalar& x)
+ {
+ m_matrix.const_cast_derived().template writePacket<UnAligned>
+ (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), x);
+ }
+
protected:
const typename MatrixType::Nested m_matrix;
@@ -260,22 +289,30 @@ inline const Block<Derived> MatrixBase<Derived>
* \sa class Block, block(int,int)
*/
template<typename Derived>
-inline Block<Derived> MatrixBase<Derived>::start(int size)
+inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
+MatrixBase<Derived>::start(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- return Block<Derived>(derived(), 0, 0,
- RowsAtCompileTime == 1 ? 1 : size,
- ColsAtCompileTime == 1 ? 1 : size);
+ return Block<Derived,
+ RowsAtCompileTime == 1 ? 1 : Dynamic,
+ ColsAtCompileTime == 1 ? 1 : Dynamic>
+ (derived(), 0, 0,
+ RowsAtCompileTime == 1 ? 1 : size,
+ ColsAtCompileTime == 1 ? 1 : size);
}
/** This is the const version of start(int).*/
template<typename Derived>
-inline const Block<Derived> MatrixBase<Derived>::start(int size) const
+inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
+MatrixBase<Derived>::start(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- return Block<Derived>(derived(), 0, 0,
- RowsAtCompileTime == 1 ? 1 : size,
- ColsAtCompileTime == 1 ? 1 : size);
+ return Block<Derived,
+ RowsAtCompileTime == 1 ? 1 : Dynamic,
+ ColsAtCompileTime == 1 ? 1 : Dynamic>
+ (derived(), 0, 0,
+ RowsAtCompileTime == 1 ? 1 : size,
+ ColsAtCompileTime == 1 ? 1 : size);
}
/** \returns a dynamic-size expression of the last coefficients of *this.
@@ -294,26 +331,34 @@ inline const Block<Derived> MatrixBase<Derived>::start(int size) const
* \sa class Block, block(int,int)
*/
template<typename Derived>
-inline Block<Derived> MatrixBase<Derived>::end(int size)
+inline typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
+MatrixBase<Derived>::end(int size)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- return Block<Derived>(derived(),
- RowsAtCompileTime == 1 ? 0 : rows() - size,
- ColsAtCompileTime == 1 ? 0 : cols() - size,
- RowsAtCompileTime == 1 ? 1 : size,
- ColsAtCompileTime == 1 ? 1 : size);
+ return Block<Derived,
+ RowsAtCompileTime == 1 ? 1 : Dynamic,
+ ColsAtCompileTime == 1 ? 1 : Dynamic>
+ (derived(),
+ RowsAtCompileTime == 1 ? 0 : rows() - size,
+ ColsAtCompileTime == 1 ? 0 : cols() - size,
+ RowsAtCompileTime == 1 ? 1 : size,
+ ColsAtCompileTime == 1 ? 1 : size);
}
/** This is the const version of end(int).*/
template<typename Derived>
-inline const Block<Derived> MatrixBase<Derived>::end(int size) const
+inline const typename MatrixBase<Derived>::template SubVectorReturnType<Dynamic>::Type
+MatrixBase<Derived>::end(int size) const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- return Block<Derived>(derived(),
- RowsAtCompileTime == 1 ? 0 : rows() - size,
- ColsAtCompileTime == 1 ? 0 : cols() - size,
- RowsAtCompileTime == 1 ? 1 : size,
- ColsAtCompileTime == 1 ? 1 : size);
+ return Block<Derived,
+ RowsAtCompileTime == 1 ? 1 : Dynamic,
+ ColsAtCompileTime == 1 ? 1 : Dynamic>
+ (derived(),
+ RowsAtCompileTime == 1 ? 0 : rows() - size,
+ ColsAtCompileTime == 1 ? 0 : cols() - size,
+ RowsAtCompileTime == 1 ? 1 : size,
+ ColsAtCompileTime == 1 ? 1 : size);
}
/** \returns a fixed-size expression of the first coefficients of *this.
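
The new one-argument accessors in Block assume a vector block: when RowsAtCompileTime == 1 the index advances along the columns, otherwise along the rows. A standalone sketch of that index-to-(row, col) translation (VectorBlockIndex is a hypothetical name, not part of the patch):

    // Mirrors the index -> (row, col) translation used by Block's
    // _coeff(int) / _packet(int) above.
    struct VectorBlockIndex
    {
      int startRow, startCol;
      bool isRowVector; // plays the role of RowsAtCompileTime == 1
      void map(int index, int& row, int& col) const
      {
        row = startRow + (isRowVector ? 0 : index); // column vector: walk down
        col = startCol + (isRowVector ? index : 0); // row vector: walk right
      }
    };
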
diff --git a/Eigen/src/Core/Coeffs.h b/Eigen/src/Core/Coeffs.h
index deb015136..53b551b19 100644
--- a/Eigen/src/Core/Coeffs.h
+++ b/Eigen/src/Core/Coeffs.h
@@ -104,7 +104,7 @@ inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
* \link operator[](int) const \endlink, but without the assertion.
* Use this for limiting the performance cost of debugging code when doing
* repeated coefficient access. Only use this when it is guaranteed that the
- * parameters \a row and \a col are in range.
+ * parameter \a index is in range.
*
* If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
* function equivalent to \link operator[](int) const \endlink.
@@ -115,22 +115,13 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::coeff(int index) const
{
- EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- if(RowsAtCompileTime == 1)
- {
- ei_internal_assert(index >= 0 && index < cols());
- return coeff(0, index);
- }
- else
- {
- ei_internal_assert(index >= 0 && index < rows());
- return coeff(index, 0);
- }
+ ei_internal_assert(index >= 0 && index < size());
+ return derived()._coeff(index);
}
/** \returns the coefficient at given index.
*
- * \only_for_vectors
+ * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int), operator()(int,int) const, x() const, y() const,
* z() const, w() const
@@ -139,17 +130,8 @@ template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::operator[](int index) const
{
- EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- if(RowsAtCompileTime == 1)
- {
- ei_assert(index >= 0 && index < cols());
- return coeff(0, index);
- }
- else
- {
- ei_assert(index >= 0 && index < rows());
- return coeff(index, 0);
- }
+ ei_assert(index >= 0 && index < size());
+ return derived()._coeff(index);
}
/** Short version: don't use this function, use
@@ -170,22 +152,13 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::coeffRef(int index)
{
- EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
- if(RowsAtCompileTime == 1)
- {
- ei_internal_assert(index >= 0 && index < cols());
- return coeffRef(0, index);
- }
- else
- {
- ei_internal_assert(index >= 0 && index < rows());
- return coeffRef(index, 0);
- }
+ ei_internal_assert(index >= 0 && index < size());
+ return derived()._coeffRef(index);
}
/** \returns a reference to the coefficient at given index.
*
- * \only_for_vectors
+ * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
*
* \sa operator[](int) const, operator()(int,int), x(), y(), z(), w()
*/
@@ -193,70 +166,119 @@ template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::operator[](int index)
{
- ei_assert(IsVectorAtCompileTime);
- if(RowsAtCompileTime == 1)
- {
- ei_assert(index >= 0 && index < cols());
- return coeffRef(0, index);
- }
- else
- {
- ei_assert(index >= 0 && index < rows());
- return coeffRef(index, 0);
- }
+ ei_assert(index >= 0 && index < size());
+ return derived()._coeffRef(index);
}
-/** equivalent to operator[](0). \only_for_vectors */
+/** equivalent to operator[](0). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::x() const { return (*this)[0]; }
-/** equivalent to operator[](1). \only_for_vectors */
+/** equivalent to operator[](1). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::y() const { return (*this)[1]; }
-/** equivalent to operator[](2). \only_for_vectors */
+/** equivalent to operator[](2). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::z() const { return (*this)[2]; }
-/** equivalent to operator[](3). \only_for_vectors */
+/** equivalent to operator[](3). */
template<typename Derived>
inline const typename ei_traits<Derived>::Scalar MatrixBase<Derived>
::w() const { return (*this)[3]; }
-/** equivalent to operator[](0). \only_for_vectors */
+/** equivalent to operator[](0). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::x() { return (*this)[0]; }
-/** equivalent to operator[](1). \only_for_vectors */
+/** equivalent to operator[](1). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::y() { return (*this)[1]; }
-/** equivalent to operator[](2). \only_for_vectors */
+/** equivalent to operator[](2). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::z() { return (*this)[2]; }
-/** equivalent to operator[](3). \only_for_vectors */
+/** equivalent to operator[](3). */
template<typename Derived>
inline typename ei_traits<Derived>::Scalar& MatrixBase<Derived>
::w() { return (*this)[3]; }
+/** \returns the packet of coefficients starting at the given row and column. It is your responsibility
+ * to ensure that a packet really starts there. This method is only available on expressions having the
+ * PacketAccessBit.
+ *
+ * The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
+ * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+ * starting at an address which is a multiple of the packet size.
+ */
template<typename Derived>
template<int LoadMode>
inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
MatrixBase<Derived>::packet(int row, int col) const
-{ return derived().template _packet<LoadMode>(row,col); }
+{
+ ei_internal_assert(row >= 0 && row < rows()
+ && col >= 0 && col < cols());
+ return derived().template _packet<LoadMode>(row,col);
+}
+/** Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
+ * to ensure that a packet really starts there. This method is only available on expressions having the
+ * PacketAccessBit.
+ *
+ * The \a StoreMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
+ * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+ * starting at an address which is a multiple of the packet size.
+ */
template<typename Derived>
template<int StoreMode>
inline void MatrixBase<Derived>::writePacket
(int row, int col, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
-{ derived().template _writePacket<StoreMode>(row,col,x); }
+{
+ ei_internal_assert(row >= 0 && row < rows()
+ && col >= 0 && col < cols());
+ derived().template _writePacket<StoreMode>(row,col,x);
+}
+
+/** \returns the packet of coefficients starting at the given index. It is your responsibility
+ * to ensure that a packet really starts there. This method is only available on expressions having the
+ * PacketAccessBit and the LinearAccessBit.
+ *
+ * The \a LoadMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
+ * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+ * starting at an address which is a multiple of the packet size.
+ */
+template<typename Derived>
+template<int LoadMode>
+inline typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
+MatrixBase<Derived>::packet(int index) const
+{
+ ei_internal_assert(index >= 0 && index < size());
+ return derived().template _packet<LoadMode>(index);
+}
+
+/** Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
+ * to ensure that a packet really starts there. This method is only available on expressions having the
+ * PacketAccessBit and the LinearAccessBit.
+ *
+ * The \a StoreMode parameter may have the value \a Aligned or \a UnAligned. Its effect is to select
+ * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+ * starting at an address which is a multiple of the packet size.
+ */
+template<typename Derived>
+template<int StoreMode>
+inline void MatrixBase<Derived>::writePacket
+(int index, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
+{
+ ei_internal_assert(index >= 0 && index < size());
+ derived().template _writePacket<StoreMode>(index,x);
+}
#endif // EIGEN_COEFFS_H
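
The Aligned/UnAligned distinction documented above maps directly to the CPU's two flavors of vector load. A self-contained SSE illustration of what packet<Aligned> versus packet<UnAligned> selects for floats (load_packet is a hypothetical name):

    #include <xmmintrin.h>

    // Aligned loads require the address to be a multiple of 16 bytes and
    // are faster; unaligned loads work at any address.
    __m128 load_packet(const float* p, bool aligned)
    {
      return aligned ? _mm_load_ps(p)   // like packet<Aligned>
                     : _mm_loadu_ps(p); // like packet<UnAligned>
    }
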
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index ec4619781..6672edcbe 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -69,7 +69,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
HereditaryBits
| (int(LhsFlags) & int(RhsFlags) & LinearAccessBit)
| (ei_functor_traits<BinaryOp>::PacketAccess && ((int(LhsFlags) & RowMajorBit)==(int(RhsFlags) & RowMajorBit))
- ? int(LhsFlags) & int(RhsFlags) & PacketAccessBit : 0)),
+ ? (int(LhsFlags) & int(RhsFlags) & PacketAccessBit) : 0)),
CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + ei_functor_traits<BinaryOp>::Cost
};
};
@@ -108,6 +108,17 @@ class CwiseBinaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_lhs.template packet<LoadMode>(row, col), m_rhs.template packet<LoadMode>(row, col));
}
+ inline const Scalar _coeff(int index) const
+ {
+ return m_functor(m_lhs.coeff(index), m_rhs.coeff(index));
+ }
+
+ template<int LoadMode>
+ inline PacketScalar _packet(int index) const
+ {
+ return m_functor.packetOp(m_lhs.template packet<LoadMode>(index), m_rhs.template packet<LoadMode>(index));
+ }
+
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index 069fad8a2..998b7ce56 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -50,7 +50,9 @@ struct ei_traits<CwiseNullaryOp<NullaryOp, MatrixType> >
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
Flags = (MatrixType::Flags
- & (HereditaryBits | LinearAccessBit | (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
+ & ( HereditaryBits
+ | (ei_functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
+ | (ei_functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
| (ei_functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
CoeffReadCost = ei_functor_traits<NullaryOp>::Cost
};
@@ -89,6 +91,17 @@ class CwiseNullaryOp : ei_no_assignment_operator,
return m_functor.packetOp();
}
+ const Scalar _coeff(int index) const
+ {
+ return m_functor(index);
+ }
+
+ template<int LoadMode>
+ PacketScalar _packet(int) const
+ {
+ return m_functor.packetOp();
+ }
+
protected:
const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 7c466d8c2..881853d28 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -90,6 +90,17 @@ class CwiseUnaryOp : ei_no_assignment_operator,
return m_functor.packetOp(m_matrix.template packet<LoadMode>(row, col));
}
+ inline const Scalar _coeff(int index) const
+ {
+ return m_functor(m_matrix.coeff(index));
+ }
+
+ template<int LoadMode>
+ inline PacketScalar _packet(int index) const
+ {
+ return m_functor.packetOp(m_matrix.template packet<LoadMode>(index));
+ }
+
protected:
const typename MatrixType::Nested m_matrix;
const UnaryOp m_functor;
diff --git a/Eigen/src/Core/DiagonalCoeffs.h b/Eigen/src/Core/DiagonalCoeffs.h
index b7d3ef475..516d52526 100644
--- a/Eigen/src/Core/DiagonalCoeffs.h
+++ b/Eigen/src/Core/DiagonalCoeffs.h
@@ -56,7 +56,8 @@ struct ei_traits<DiagonalCoeffs<MatrixType> >
MaxColsAtCompileTime = 1,
Flags = (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic
? (unsigned int)_MatrixTypeNested::Flags
- : (unsigned int)_MatrixTypeNested::Flags &~ LargeBit) & HereditaryBits,
+ : (unsigned int)_MatrixTypeNested::Flags &~ LargeBit)
+ & (HereditaryBits | LinearAccessBit),
CoeffReadCost = _MatrixTypeNested::CoeffReadCost
};
};
@@ -87,6 +88,16 @@ template<typename MatrixType> class DiagonalCoeffs
return m_matrix.coeff(row, row);
}
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_matrix.const_cast_derived().coeffRef(index, index);
+ }
+
+ inline const Scalar _coeff(int index) const
+ {
+ return m_matrix.coeff(index, index);
+ }
+
protected:
const typename MatrixType::Nested m_matrix;
diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index f902d7bbe..38d6ad46b 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -52,8 +52,7 @@ struct ei_traits<Product<LhsNested, RhsNested, DiagonalProduct> >
&& (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
RemovedBits = ~(((RhsFlags & RowMajorBit) && (!CanVectorizeLhs) ? 0 : RowMajorBit)
- | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit))
- | LinearAccessBit,
+ | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0),
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 8f38ee946..275a40ff2 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -175,27 +175,20 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
const int size = v1.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
- const bool rowVector1 = Derived1::RowsAtCompileTime == 1;
- const bool rowVector2 = Derived2::RowsAtCompileTime == 1;
Scalar res;
// do the vectorizable part of the sum
if(size >= packetSize)
{
- PacketScalar packet_res;
- packet_res = ei_pmul(
- v1.template packet<Aligned>(0, 0),
- v2.template packet<Aligned>(0, 0)
- );
+ PacketScalar packet_res = ei_pmul(
+ v1.template packet<Aligned>(0),
+ v2.template packet<Aligned>(0)
+ );
for(int index = packetSize; index<alignedSize; index += packetSize)
{
- const int row1 = rowVector1 ? 0 : index;
- const int col1 = rowVector1 ? index : 0;
- const int row2 = rowVector2 ? 0 : index;
- const int col2 = rowVector2 ? index : 0;
packet_res = ei_pmadd(
- v1.template packet<Aligned>(row1, col1),
- v2.template packet<Aligned>(row2, col2),
+ v1.template packet<Aligned>(index),
+ v2.template packet<Aligned>(index),
packet_res
);
}
@@ -213,11 +206,7 @@ struct ei_dot_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
// do the remainder of the vector
for(int index = alignedSize; index < size; index++)
{
- const int row1 = rowVector1 ? 0 : index;
- const int col1 = rowVector1 ? index : 0;
- const int row2 = rowVector2 ? 0 : index;
- const int col2 = rowVector2 ? index : 0;
- res += v1.coeff(row1, col1) * v2.coeff(row2, col2);
+ res += v1.coeff(index) * v2.coeff(index);
}
return res;
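
The simplified dot kernel accumulates a single packet register with multiply-adds and finishes with a scalar remainder, exactly as for assignment. A standalone SSE sketch of the same structure (dot_linear is a hypothetical name; v1/v2 assumed 16-byte aligned; ei_pmadd becomes a multiply plus an add on plain SSE):

    #include <xmmintrin.h>

    float dot_linear(const float* v1, const float* v2, int size)
    {
      const int packetSize = 4;
      const int alignedSize = (size / packetSize) * packetSize;
      float res = 0.f;
      if (size >= packetSize)
      {
        __m128 acc = _mm_mul_ps(_mm_load_ps(v1), _mm_load_ps(v2));
        for (int i = packetSize; i < alignedSize; i += packetSize) // ei_pmadd step
          acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(v1 + i), _mm_load_ps(v2 + i)));
        float tmp[4];
        _mm_storeu_ps(tmp, acc);                                   // ei_predux step
        res = tmp[0] + tmp[1] + tmp[2] + tmp[3];
      }
      for (int i = alignedSize; i < size; ++i)                     // scalar remainder
        res += v1[i] * v2[i];
      return res;
    }
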
diff --git a/Eigen/src/Core/Flagged.h b/Eigen/src/Core/Flagged.h
index edf1b3fd1..8afd068c4 100644
--- a/Eigen/src/Core/Flagged.h
+++ b/Eigen/src/Core/Flagged.h
@@ -84,6 +84,16 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
return m_matrix.const_cast_derived().coeffRef(row, col);
}
+ inline const Scalar _coeff(int index) const
+ {
+ return m_matrix.coeff(index);
+ }
+
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_matrix.const_cast_derived().coeffRef(index);
+ }
+
template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{
@@ -96,6 +106,18 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
+ template<int LoadMode>
+ inline const PacketScalar _packet(int index) const
+ {
+ return m_matrix.template packet<LoadMode>(index);
+ }
+
+ template<int LoadMode>
+ inline void _writePacket(int index, const PacketScalar& x)
+ {
+ m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
+ }
+
protected:
ExpressionTypeNested m_matrix;
};
diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h
index ab841bae7..2f00fdd05 100644
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -302,13 +302,13 @@ struct ei_scalar_quotient1_op : ei_scalar_quotient1_impl<Scalar, NumTraits<Scala
// nullary functors
-template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1?true:false) > struct ei_scalar_constant_op;
+template<typename Scalar, bool PacketAccess = (int(ei_packet_traits<Scalar>::size)>1) > struct ei_scalar_constant_op;
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,true> {
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
inline ei_scalar_constant_op(const Scalar& other) : m_other(ei_pset1(other)) { }
- inline const Scalar operator() (int, int) const { return ei_pfirst(m_other); }
+ inline const Scalar operator() (int, int = 0) const { return ei_pfirst(m_other); }
inline const PacketScalar packetOp() const
{ return m_other; }
const PacketScalar m_other;
@@ -316,7 +316,7 @@ struct ei_scalar_constant_op<Scalar,true> {
template<typename Scalar>
struct ei_scalar_constant_op<Scalar,false> {
inline ei_scalar_constant_op(const Scalar& other) : m_other(other) { }
- inline const Scalar operator() (int, int) const { return m_other; }
+ inline const Scalar operator() (int, int = 0) const { return m_other; }
const Scalar m_other;
};
template<typename Scalar>
@@ -331,4 +331,11 @@ template<typename Scalar>
struct ei_functor_traits<ei_scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
+// NOTE quick hack:
+// all functors allow linear access, except ei_scalar_identity_op. So we define here a quick meta
+// to indicate whether a given functor allows linear access, always answering 'yes' except for
+// ei_scalar_identity_op.
+template<typename Functor> struct ei_functor_has_linear_access { enum { ret = 1 }; };
+template<typename Scalar> struct ei_functor_has_linear_access<ei_scalar_identity_op<Scalar> > { enum { ret = 0 }; };
+
#endif // EIGEN_FUNCTORS_H
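
The "quick meta" above is the usual primary-template-plus-specialization idiom: the primary template answers yes for every functor, and a single specialization opts ei_scalar_identity_op out. A generic standalone sketch of the idiom (has_linear_access and identity_like are hypothetical names):

    // Primary template: the default compile-time answer for all functors.
    template<typename Functor> struct has_linear_access { enum { ret = 1 }; };

    // One functor opts out, mirroring ei_scalar_identity_op above.
    struct identity_like {};
    template<> struct has_linear_access<identity_like> { enum { ret = 0 }; };

    // Used at compile time, e.g. in ei_traits<CwiseNullaryOp>:
    //   Flags = ... | (has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
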
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index f7094a4a1..762e1fb31 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -31,14 +31,14 @@
*/
template<typename Derived>
std::ostream & operator <<
-( std::ostream & s,
- const MatrixBase<Derived> & m )
+(std::ostream & s,
+ const MatrixBase<Derived> & m)
{
- for( int i = 0; i < m.rows(); i++ )
+ for(int i = 0; i < m.rows(); i++)
{
- s << m( i, 0 );
- for (int j = 1; j < m.cols(); j++ )
- s << " " << m( i, j );
+ s << m.coeff(i, 0);
+ for(int j = 1; j < m.cols(); j++)
+ s << " " << m.coeff(i, j);
if( i < m.rows() - 1)
s << "\n";
}
diff --git a/Eigen/src/Core/InverseProduct.h b/Eigen/src/Core/InverseProduct.h
index 4a3579bbf..40496e01d 100755
--- a/Eigen/src/Core/InverseProduct.h
+++ b/Eigen/src/Core/InverseProduct.h
@@ -55,33 +55,33 @@ typename OtherDerived::Eval MatrixBase<Derived>::inverseProduct(const MatrixBase
{
// forward substitution
if(Flags & UnitDiagBit)
- res.coeffRef(0,c) = other.coeff(0,c);
+ res.coeffRef(0,c) = other.coeff(0,c);
else
- res.coeffRef(0,c) = other.coeff(0,c)/coeff(0, 0);
+ res.coeffRef(0,c) = other.coeff(0,c)/coeff(0, 0);
for(int i=1; i<rows(); ++i)
{
- Scalar tmp = other.coeff(i,c) - ((this->row(i).start(i)) * res.col(c).start(i)).coeff(0,0);
- if (Flags & UnitDiagBit)
- res.coeffRef(i,c) = tmp;
- else
- res.coeffRef(i,c) = tmp/coeff(i,i);
+ Scalar tmp = other.coeff(i,c) - ((this->row(i).start(i)) * res.col(c).start(i)).coeff(0,0);
+ if (Flags & UnitDiagBit)
+ res.coeffRef(i,c) = tmp;
+ else
+ res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
else
{
// backward substitution
if(Flags & UnitDiagBit)
- res.coeffRef(cols()-1,c) = other.coeff(cols()-1,c);
+ res.coeffRef(cols()-1,c) = other.coeff(cols()-1,c);
else
- res.coeffRef(cols()-1,c) = other.coeff(cols()-1, c)/coeff(rows()-1, cols()-1);
+ res.coeffRef(cols()-1,c) = other.coeff(cols()-1, c)/coeff(rows()-1, cols()-1);
for(int i=rows()-2 ; i>=0 ; --i)
{
- Scalar tmp = other.coeff(i,c)
- - ((this->row(i).end(cols()-i-1)) * res.col(c).end(cols()-i-1)).coeff(0,0);
- if (Flags & UnitDiagBit)
- res.coeffRef(i,c) = tmp;
- else
- res.coeffRef(i,c) = tmp/coeff(i,i);
+ Scalar tmp = other.coeff(i,c)
+ - ((this->row(i).end(cols()-i-1)) * res.col(c).end(cols()-i-1)).coeff(0,0);
+ if (Flags & UnitDiagBit)
+ res.coeffRef(i,c) = tmp;
+ else
+ res.coeffRef(i,c) = tmp/coeff(i,i);
}
}
}
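
For reference, the re-indented loops implement column-by-column forward and backward substitution; each unknown is computed from the already-solved prefix (the row(i).start(i) * res.col(c).start(i) product). A minimal standalone version of the forward case for a dense lower triangular matrix (forward_subst is a hypothetical name, not part of the patch):

    // Solve L * x = b, with L lower triangular (row-major, nonzero diagonal).
    void forward_subst(const float* L, const float* b, float* x, int n)
    {
      for (int i = 0; i < n; ++i)
      {
        float tmp = b[i];
        for (int j = 0; j < i; ++j)   // dot of row prefix with solved prefix
          tmp -= L[i * n + j] * x[j];
        x[i] = tmp / L[i * n + i];    // division skipped when UnitDiagBit is set
      }
    }
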
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index f4c660af9..bcd72bdb8 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -80,6 +80,16 @@ template<typename MatrixType> class Map
return const_cast<Scalar*>(m_data)[row + col * m_rows];
}
+ inline const Scalar& _coeff(int index) const
+ {
+ return m_data[index];
+ }
+
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_data[index];
+ }
+
public:
inline Map(const Scalar* data, int rows, int cols) : m_data(data), m_rows(rows), m_cols(cols)
{
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 11fadf49b..3d037cbee 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -128,6 +128,11 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}
+ inline const Scalar& _coeff(int index) const
+ {
+ return m_storage.data()[index];
+ }
+
inline Scalar& _coeffRef(int row, int col)
{
if(Flags & RowMajorBit)
@@ -136,20 +141,33 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
return m_storage.data()[row + col * m_storage.rows()];
}
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_storage.data()[index];
+ }
+
template<int LoadMode>
inline PacketScalar _packet(int row, int col) const
{
- ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (LoadMode==Aligned)
- return ei_pload(&m_storage.data()[col + row * m_storage.cols()]);
+ return ei_pload(m_storage.data() + col + row * m_storage.cols());
else
- return ei_ploadu(&m_storage.data()[col + row * m_storage.cols()]);
+ return ei_ploadu(m_storage.data() + col + row * m_storage.cols());
else
if (LoadMode==Aligned)
- return ei_pload(&m_storage.data()[row + col * m_storage.rows()]);
+ return ei_pload(m_storage.data() + row + col * m_storage.rows());
else
- return ei_ploadu(&m_storage.data()[row + col * m_storage.rows()]);
+ return ei_ploadu(m_storage.data() + row + col * m_storage.rows());
+ }
+
+ template<int LoadMode>
+ inline PacketScalar _packet(int index) const
+ {
+ if (LoadMode==Aligned)
+ return ei_pload(m_storage.data() + index);
+ else
+ return ei_ploadu(m_storage.data() + index);
}
template<int StoreMode>
@@ -158,14 +176,23 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _MaxRows, _MaxCol
ei_internal_assert(Flags & PacketAccessBit);
if(Flags & RowMajorBit)
if (StoreMode==Aligned)
- ei_pstore(&m_storage.data()[col + row * m_storage.cols()], x);
+ ei_pstore(m_storage.data() + col + row * m_storage.cols(), x);
else
- ei_pstoreu(&m_storage.data()[col + row * m_storage.cols()], x);
+ ei_pstoreu(m_storage.data() + col + row * m_storage.cols(), x);
else
if (StoreMode==Aligned)
- ei_pstore(&m_storage.data()[row + col * m_storage.rows()], x);
+ ei_pstore(m_storage.data() + row + col * m_storage.rows(), x);
+ else
+ ei_pstoreu(m_storage.data() + row + col * m_storage.rows(), x);
+ }
+
+ template<int StoreMode>
+ inline void _writePacket(int index, const PacketScalar& x)
+ {
+ if (StoreMode==Aligned)
+ ei_pstore(m_storage.data() + index, x);
else
- ei_pstoreu(&m_storage.data()[row + col * m_storage.rows()], x);
+ ei_pstoreu(m_storage.data() + index, x);
}
public:
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 5698b7602..f503ebcbf 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -228,6 +228,11 @@ template<typename Derived> class MatrixBase
template<int StoreMode>
void writePacket(int row, int col, const PacketScalar& x);
+ template<int LoadMode>
+ PacketScalar packet(int index) const;
+ template<int StoreMode>
+ void writePacket(int index, const PacketScalar& x);
+
const Scalar x() const;
const Scalar y() const;
const Scalar z() const;
@@ -307,11 +312,11 @@ template<typename Derived> class MatrixBase
Block<Derived> block(int start, int size);
const Block<Derived> block(int start, int size) const;
- Block<Derived> start(int size);
- const Block<Derived> start(int size) const;
+ typename SubVectorReturnType<Dynamic>::Type start(int size);
+ const typename SubVectorReturnType<Dynamic>::Type start(int size) const;
- Block<Derived> end(int size);
- const Block<Derived> end(int size) const;
+ typename SubVectorReturnType<Dynamic>::Type end(int size);
+ const typename SubVectorReturnType<Dynamic>::Type end(int size) const;
Block<Derived> corner(CornerType type, int cRows, int cCols);
const Block<Derived> corner(CornerType type, int cRows, int cCols) const;
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 0c6cdb114..a63202dfd 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -76,6 +76,16 @@ template<typename ExpressionType> class NestByValue
return m_expression.const_cast_derived().coeffRef(row, col);
}
+ inline const Scalar _coeff(int index) const
+ {
+ return m_expression.coeff(index);
+ }
+
+ inline Scalar& _coeffRef(int index)
+ {
+ return m_expression.const_cast_derived().coeffRef(index);
+ }
+
template<int LoadMode>
inline const PacketScalar _packet(int row, int col) const
{
@@ -88,6 +98,18 @@ template<typename ExpressionType> class NestByValue
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
+ template<int LoadMode>
+ inline const PacketScalar _packet(int index) const
+ {
+ return m_expression.template packet<LoadMode>(index);
+ }
+
+ template<int LoadMode>
+ inline void _writePacket(int index, const PacketScalar& x)
+ {
+ m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
+ }
+
protected:
const ExpressionType m_expression;
};
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 1e90d2ef9..df35ffc4f 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -151,8 +151,7 @@ struct ei_traits<Product<LhsNested, RhsNested, ProductMode> >
&& (ProductMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!CanVectorizeLhs)),
RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit)
- | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)
- | LinearAccessBit),
+ | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
| EvalBeforeAssigningBit
@@ -224,6 +223,18 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
+  /* Allow index-based non-packet access. It is however impossible to allow index-based packet access,
+ * which is why we don't set the LinearAccessBit.
+ */
+ const Scalar _coeff(int index) const
+ {
+ Scalar res;
+ const int row = RowsAtCompileTime == 1 ? 0 : index;
+ const int col = RowsAtCompileTime == 1 ? index : 0;
+ ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+ return res;
+ }
+
template<int LoadMode>
const PacketScalar _packet(int row, int col) const
{
@@ -235,9 +246,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
- template<typename Lhs_, typename Rhs_, int ProductMode_, typename DestDerived_, bool DirectAccess_>
- friend struct ei_cache_friendly_selector;
-
protected:
const LhsNested m_lhs;
const RhsNested m_rhs;
diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h
index ccdb6f4a3..af35d20f9 100644
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@@ -186,34 +186,15 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
const int size = mat.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
- const bool rowMajor = Derived::Flags&RowMajorBit;
- const int innerSize = rowMajor ? mat.cols() : mat.rows();
- const int outerSize = rowMajor ? mat.rows() : mat.cols();
Scalar res;
- // do the vectorizable part of the sum
if(size >= packetSize)
{
- PacketScalar packet_res;
- packet_res = mat.template packet<Aligned>(0, 0);
- int row = 0;
- int col = 0;
- int index = packetSize;
- while (index<alignedSize)
- {
- row = rowMajor ? index/innerSize : index%innerSize;
- col = rowMajor ? index%innerSize : index/innerSize;
- int start = rowMajor ? col : row;
- int end = std::min(innerSize, start+alignedSize-index);
- if (end<start) getchar();
- for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
- packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
- index += (rowMajor ? col : row) - start;
- }
- res = ei_predux(packet_res);
+ PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
+ for(int index = packetSize; index < alignedSize; index += packetSize)
+ packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));
- // now we must do the rest without vectorization.
- if(alignedSize == size) return res;
+ res = ei_predux(packet_res);
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
@@ -221,25 +202,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
res = Scalar(0);
}
- const int k = alignedSize/innerSize;
-
- // do the remainder of the current row or col
- for(int i = alignedSize%innerSize; i < innerSize; i++)
+ for(int index = alignedSize; index < size; index++)
{
- const int row = rowMajor ? k : i;
- const int col = rowMajor ? i : k;
- res += mat.coeff(row, col);
+ res += mat.coeff(index);
}
- // do the remaining rows or cols
- for(int j = k+1; j < outerSize; j++)
- for(int i = 0; i < innerSize; i++)
- {
- const int row = rowMajor ? i : j;
- const int col = rowMajor ? j : i;
- res += mat.coeff(row, col);
- }
-
return res;
}
};
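
After the packet loop, ei_predux performs the horizontal add that folds the accumulator register into one scalar. A self-contained SSE sketch of that reduction for four floats (predux4 is a hypothetical name):

    #include <xmmintrin.h>

    // Horizontal sum of one SSE register: (v0+v2) + (v1+v3).
    float predux4(__m128 v)
    {
      __m128 t = _mm_add_ps(v, _mm_movehl_ps(v, v)); // lanes: v0+v2, v1+v3, ...
      t = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));    // add the two partial sums
      float r;
      _mm_store_ss(&r, t);
      return r;
    }
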
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index ac1b583fa..716d86243 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -56,26 +56,60 @@ const unsigned int EvalBeforeNestingBit = 0x2;
 * means the expression should be evaluated before any assignment */
const unsigned int EvalBeforeAssigningBit = 0x4;
-/** \ingroup flags
- *
- * currently unused. Means the matrix probably has a very big size.
- * Could eventually be used as a hint to determine which algorithms
- * to use. */
-const unsigned int LargeBit = 0x8;
-
#ifdef EIGEN_VECTORIZE
/** \ingroup flags
*
- * means the expression might be vectorized */
-const unsigned int PacketAccessBit = 0x10;
+ * Short version: means the expression might be vectorized
+ *
+ * Long version: means that the coefficients can be handled by packets
+ * and start at a memory location whose alignment meets the requirements
+ * of the present CPU architecture for optimized packet access. In the fixed-size
+ * case, there is the additional condition that the total size of the coefficients
+ * array is a multiple of the packet size, so that it is possible to access all the
+ * coefficients by packets. In the dynamic-size case, there is no such condition
+ * on the total size, so it might not be possible to access the last few coefficients
+ * by packets.
+ *
+ * \note If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
+ * is set to the value 0.
+ */
+const unsigned int PacketAccessBit = 0x8;
#else
const unsigned int PacketAccessBit = 0x0;
#endif
/** \ingroup flags
*
- * means the expression can be seen as 1D vector (used for explicit vectorization) */
-const unsigned int LinearAccessBit = 0x20;
+ * Short version: means the expression can be seen as a 1D vector.
+ *
+ * Long version: means that one can access the coefficients
+ * of this expression by coeff(int), and coeffRef(int) in the case of an lvalue expression. These
+ * index-based access methods are guaranteed
+ * never to require any runtime computation of a (row, col)-pair from the index, so that
+ * whenever it is available, index-based access is at least as fast as
+ * (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
+ *
+ * If both PacketAccessBit and LinearAccessBit are set, then the
+ * packets of this expression can be accessed by packet(int), and writePacket(int) in the case of an
+ * lvalue expression.
+ *
+ * Typically, all vector expressions have the LinearAccessBit, but there is one exception:
+ * Product expressions don't have it, because it would be troublesome for vectorization, even when the
+ * Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
+ * not index-based packet access, so they don't have the LinearAccessBit.
+ */
+const unsigned int LinearAccessBit = 0x10;
+
+/** \ingroup flags
+ *
+ * Means that the underlying array of coefficients can be directly accessed. This means two things.
+ * First, references to the coefficients must be available through coeffRef(int, int). This rules out read-only
+ * expressions whose coefficients are computed on demand by coeff(int, int). Second, the memory layout of the
+ * array of coefficients must be exactly the natural one suggested by rows(), cols(), stride(), and the RowMajorBit.
+ * This rules out expressions such as DiagonalCoeffs, whose coefficients, though referenceable, do not have
+ * such a regular memory layout.
+ */
+const unsigned int DirectAccessBit = 0x20;
/** \ingroup flags
*
@@ -104,17 +138,17 @@ const unsigned int LowerTriangularBit = 0x400;
/** \ingroup flags
*
- * means the underlying matrix data can be direclty accessed (contrary to certain
- * expressions where the matrix coefficients need to be computed rather than just read from
- * memory) */
-const unsigned int DirectAccessBit = 0x800;
-
-/** \ingroup flags
- *
* means the object is just an array of scalars, and operations on it are regarded as operations
 * on each of these scalars taken separately.
*/
-const unsigned int ArrayBit = 0x1000;
+const unsigned int ArrayBit = 0x800;
+
+/** \ingroup flags
+ *
+ * currently unused. Means the matrix probably has a very big size.
+ * Could eventually be used as a hint to determine which algorithms
+ * to use. */
+const unsigned int LargeBit = 0x1000;
// list of flags that are inherited by default
const unsigned int HereditaryBits = RowMajorBit
@@ -135,7 +169,6 @@ const unsigned int UnitUpper = UpperTriangularBit | UnitDiagBit;
const unsigned int UnitLower = LowerTriangularBit | UnitDiagBit;
const unsigned int Diagonal = Upper | Lower;
-
enum { Aligned=0, UnAligned=1 };
enum { ConditionalJumpCost = 5 };
enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
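
An expression's Flags word is simply the bitwise OR of these constants, and capability tests are bitwise ANDs that can be evaluated inside compile-time enums. A short standalone sketch (the two constants are redefined here so the snippet compiles on its own; the flags value is a made-up example):

    // Mirrors of the constants defined above, for a standalone snippet.
    const unsigned int PacketAccessBit = 0x8;
    const unsigned int LinearAccessBit = 0x10;

    // A hypothetical expression advertising both capabilities:
    const unsigned int flags = PacketAccessBit | LinearAccessBit;

    // packet(int)/writePacket(int) are only legal when both bits are set:
    const bool canUseIndexBasedPackets =
        (flags & PacketAccessBit) && (flags & LinearAccessBit);
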
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 078beb681..5d809f619 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -155,7 +155,8 @@ class ei_corrected_matrix_flags
// so let us strictly honor the user's choice.
? SuggestedFlags&RowMajorBit
: Cols > 1 ? RowMajorBit : 0,
- is_big = MaxRows == Dynamic || MaxCols == Dynamic,
+ inner_max_size = row_major_bit ? MaxCols : MaxRows,
+ is_big = inner_max_size == Dynamic,
linear_size = Cols * Rows,
packet_access_bit
= ei_packet_traits<Scalar>::size > 1
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 37ae1ed82..f74bc7775 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -75,7 +75,9 @@
// static assertion failing if the type \a TYPE is not a vector type
-#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime,you_tried_calling_a_vector_method_on_a_matrix)
+#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
+ EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \
+ you_tried_calling_a_vector_method_on_a_matrix)
// static assertion failing if the two vector expression types are not compatible (same fixed-size or dynamic size)
#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \
diff --git a/bench/benchVecAdd.cpp b/bench/benchVecAdd.cpp
new file mode 100644
index 000000000..aa211dce0
--- /dev/null
+++ b/bench/benchVecAdd.cpp
@@ -0,0 +1,134 @@
+
+#include <Eigen/Core>
+#include <bench/BenchTimer.h>
+using namespace Eigen;
+
+#ifndef SIZE
+#define SIZE 50
+#endif
+
+#ifndef REPEAT
+#define REPEAT 10000
+#endif
+
+typedef float Scalar;
+
+__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
+__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
+__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
+
+int main(int argc, char* argv[])
+{
+ int size = SIZE * 8;
+ int size2 = size * size;
+ Scalar* a = ei_aligned_malloc<Scalar>(size2);
+ Scalar* b = ei_aligned_malloc<Scalar>(size2);
+ Scalar* c = ei_aligned_malloc<Scalar>(size2);
+
+  // initialize the full buffers (they hold size2 scalars each)
+  for (int i=0; i<size2; ++i)
+ {
+ a[i] = b[i] = c[i] = 0;
+ }
+
+ BenchTimer timer;
+
+ timer.reset();
+ for (int k=0; k<3; ++k)
+ {
+ timer.start();
+ benchVec(a, b, c, size2);
+ timer.stop();
+ }
+ std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
+
+ for (int innersize = size; innersize>2 ; --innersize)
+ {
+ if (size2%innersize==0)
+ {
+ int outersize = size2/innersize;
+ MatrixXf ma = MatrixXf::map(a, innersize, outersize );
+ MatrixXf mb = MatrixXf::map(b, innersize, outersize );
+ MatrixXf mc = MatrixXf::map(c, innersize, outersize );
+ timer.reset();
+ for (int k=0; k<3; ++k)
+ {
+ timer.start();
+ benchVec(ma, mb, mc);
+ timer.stop();
+ }
+ std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
+ }
+ }
+
+ VectorXf va = VectorXf::map(a, size2);
+ VectorXf vb = VectorXf::map(b, size2);
+ VectorXf vc = VectorXf::map(c, size2);
+ timer.reset();
+ for (int k=0; k<3; ++k)
+ {
+ timer.start();
+ benchVec(va, vb, vc);
+ timer.stop();
+ }
+ std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
+
+ return 0;
+}
+
+void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
+{
+ for (int k=0; k<REPEAT; ++k)
+ a = a + b;
+}
+
+void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
+{
+ for (int k=0; k<REPEAT; ++k)
+ a = a + b;
+}
+
+void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
+{
+ typedef ei_packet_traits<Scalar>::type PacketScalar;
+ const int PacketSize = ei_packet_traits<Scalar>::size;
+ PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
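+    // The loads and stores below are interleaved by hand (software
+    // pipelining): each iteration stores packets loaded earlier while
+    // the next loads are already issued, hiding memory latency.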
+ for (int k=0; k<REPEAT; ++k)
+ for (int i=0; i<size; i+=PacketSize*8)
+ {
+ a0 = ei_pload(&a[i]);
+ b0 = ei_pload(&b[i]);
+ a1 = ei_pload(&a[i+1*PacketSize]);
+ b1 = ei_pload(&b[i+1*PacketSize]);
+ a2 = ei_pload(&a[i+2*PacketSize]);
+ b2 = ei_pload(&b[i+2*PacketSize]);
+ a3 = ei_pload(&a[i+3*PacketSize]);
+ b3 = ei_pload(&b[i+3*PacketSize]);
+ ei_pstore(&a[i], ei_padd(a0, b0));
+ a0 = ei_pload(&a[i+4*PacketSize]);
+ b0 = ei_pload(&b[i+4*PacketSize]);
+
+ ei_pstore(&a[i+1*PacketSize], ei_padd(a1, b1));
+ a1 = ei_pload(&a[i+5*PacketSize]);
+ b1 = ei_pload(&b[i+5*PacketSize]);
+
+ ei_pstore(&a[i+2*PacketSize], ei_padd(a2, b2));
+ a2 = ei_pload(&a[i+6*PacketSize]);
+ b2 = ei_pload(&b[i+6*PacketSize]);
+
+ ei_pstore(&a[i+3*PacketSize], ei_padd(a3, b3));
+ a3 = ei_pload(&a[i+7*PacketSize]);
+ b3 = ei_pload(&b[i+7*PacketSize]);
+
+ ei_pstore(&a[i+4*PacketSize], ei_padd(a0, b0));
+ ei_pstore(&a[i+5*PacketSize], ei_padd(a1, b1));
+ ei_pstore(&a[i+6*PacketSize], ei_padd(a2, b2));
+ ei_pstore(&a[i+7*PacketSize], ei_padd(a3, b3));
+
+// ei_pstore(&a[i+2*PacketSize], ei_padd(ei_pload(&a[i+2*PacketSize]), ei_pload(&b[i+2*PacketSize])));
+// ei_pstore(&a[i+3*PacketSize], ei_padd(ei_pload(&a[i+3*PacketSize]), ei_pload(&b[i+3*PacketSize])));
+// ei_pstore(&a[i+4*PacketSize], ei_padd(ei_pload(&a[i+4*PacketSize]), ei_pload(&b[i+4*PacketSize])));
+// ei_pstore(&a[i+5*PacketSize], ei_padd(ei_pload(&a[i+5*PacketSize]), ei_pload(&b[i+5*PacketSize])));
+// ei_pstore(&a[i+6*PacketSize], ei_padd(ei_pload(&a[i+6*PacketSize]), ei_pload(&b[i+6*PacketSize])));
+// ei_pstore(&a[i+7*PacketSize], ei_padd(ei_pload(&a[i+7*PacketSize]), ei_pload(&b[i+7*PacketSize])));
+ }
+}