Diffstat (limited to 'Eigen/src')
30 files changed, 326 insertions, 260 deletions
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 009fd845d..c7dfedae4 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -300,9 +300,10 @@ template<typename Derived> bool DenseBase<Derived>::isApproxToConstant (const Scalar& val, const RealScalar& prec) const { + typename internal::nested_eval<Derived,1>::type self(derived()); for(Index j = 0; j < cols(); ++j) for(Index i = 0; i < rows(); ++i) - if(!internal::isApprox(this->coeff(i, j), val, prec)) + if(!internal::isApprox(self.coeff(i, j), val, prec)) return false; return true; } @@ -484,9 +485,10 @@ DenseBase<Derived>::Zero() template<typename Derived> bool DenseBase<Derived>::isZero(const RealScalar& prec) const { + typename internal::nested_eval<Derived,1>::type self(derived()); for(Index j = 0; j < cols(); ++j) for(Index i = 0; i < rows(); ++i) - if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec)) + if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec)) return false; return true; } @@ -719,18 +721,19 @@ template<typename Derived> bool MatrixBase<Derived>::isIdentity (const RealScalar& prec) const { + typename internal::nested_eval<Derived,1>::type self(derived()); for(Index j = 0; j < cols(); ++j) { for(Index i = 0; i < rows(); ++i) { if(i == j) { - if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec)) + if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec)) return false; } else { - if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec)) + if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec)) return false; } } diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index ab41641f4..8fcc83a5a 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -35,22 +35,22 @@ void check_static_allocation_size() } template<typename T, int Size, typename Packet = typename packet_traits<T>::type, - bool Match = bool((Size%unpacket_traits<Packet>::size)==0), - bool TryHalf = bool(int(unpacket_traits<Packet>::size) > Size) + bool Match = bool((Size%unpacket_traits<Packet>::size)==0), + bool TryHalf = bool(int(unpacket_traits<Packet>::size) > 1) && bool(int(unpacket_traits<Packet>::size) > int(unpacket_traits<typename unpacket_traits<Packet>::half>::size)) > struct compute_default_alignment { enum { value = 0 }; }; -template<typename T, int Size, typename Packet> -struct compute_default_alignment<T, Size, Packet, true, false> // Match +template<typename T, int Size, typename Packet, bool TryHalf> +struct compute_default_alignment<T, Size, Packet, true, TryHalf> // Match { enum { value = sizeof(T) * unpacket_traits<Packet>::size }; }; template<typename T, int Size, typename Packet> -struct compute_default_alignment<T, Size, Packet, false, true> +struct compute_default_alignment<T, Size, Packet, false, true> // Try-half { // current packet too large, try with an half-packet enum { value = compute_default_alignment<T, Size, typename unpacket_traits<Packet>::half>::value }; diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 68e9c2660..6228f71bd 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -224,13 +224,13 @@ bool MatrixBase<Derived>::isOrthogonal template<typename Derived> bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const { - typename Derived::Nested nested(derived()); + typename internal::nested_eval<Derived,1>::type self(derived()); for(Index i = 
0; i < cols(); ++i) { - if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec)) + if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec)) return false; for(Index j = 0; j < i; ++j) - if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec)) + if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec)) return false; } return true; diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index e1b233d82..3c240c272 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -328,6 +328,7 @@ struct hypot_impl p = _y; qp = _x / p; } + if(p==RealScalar(0)) return RealScalar(0); return p * sqrt(RealScalar(1) + qp*qp); } }; diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index d84e7776b..22b5e024b 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -409,7 +409,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, - CoeffReadCost = (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic + CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost + : (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits<Scalar>::AddCost==Dynamic || NumTraits<Scalar>::MulCost==Dynamic) ? Dynamic : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + (InnerSize - 1) * NumTraits<Scalar>::AddCost, @@ -484,7 +485,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, { PacketScalar res; typedef etor_product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor, - Unroll ? InnerSize-1 : Dynamic, + Unroll ? 
InnerSize : Dynamic, LhsEtorType, RhsEtorType, PacketScalar, LoadMode> PacketImpl; PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res); @@ -527,7 +528,7 @@ struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, Load static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res); - res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); + res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode>(UnrollingIndex-1, col), res); } }; @@ -537,12 +538,12 @@ struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, Load static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res); - res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res); + res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res); } }; template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> +struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> { static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) { @@ -551,7 +552,7 @@ struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> }; template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> +struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> { static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) { @@ -560,13 +561,30 @@ struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> }; template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> +{ + static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + { + res = pset1<Packet>(0); + } +}; + +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> +{ + static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + { + res = pset1<Packet>(0); + } +}; + +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> { static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { - eigen_assert(innerDim>0 && "you are using a non initialized matrix"); - res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); - for(Index i = 1; i < innerDim; ++i) + res = pset1<Packet>(0); + for(Index i = 0; i < innerDim; ++i) res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); } }; @@ -576,9 +594,8 @@ struct etor_product_packet_impl<ColMajor, 
Dynamic, Lhs, Rhs, Packet, LoadMode> { static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { - eigen_assert(innerDim>0 && "you are using a non initialized matrix"); - res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col))); - for(Index i = 1; i < innerDim; ++i) + res = pset1<Packet>(0); + for(Index i = 0; i < innerDim; ++i) res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res); } }; @@ -678,8 +695,7 @@ public: //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))), _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, - Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit - //(int(MatrixFlags)&int(DiagFlags)&AlignedBit), + Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0) }; diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 291300a4a..5237fbf1c 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -200,17 +200,82 @@ DenseBase<Derived>::reverse() const * In most cases it is probably better to simply use the reversed expression * of a matrix. However, when reversing the matrix data itself is really needed, * then this "in-place" version is probably the right choice because it provides - * the following additional features: + * the following additional benefits: * - less error prone: doing the same operation with .reverse() requires special care: * \code m = m.reverse().eval(); \endcode - * - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap) + * - this API enables reverse operations without the need for a temporary * - it allows future optimizations (cache friendliness, etc.) 
* - * \sa reverse() */ + * \sa VectorwiseOp::reverseInPlace(), reverse() */ template<typename Derived> inline void DenseBase<Derived>::reverseInPlace() { - derived() = derived().reverse().eval(); + if(cols()>rows()) + { + Index half = cols()/2; + leftCols(half).swap(rightCols(half).reverse()); + if((cols()%2)==1) + { + Index half2 = rows()/2; + col(half).head(half2).swap(col(half).tail(half2).reverse()); + } + } + else + { + Index half = rows()/2; + topRows(half).swap(bottomRows(half).reverse()); + if((rows()%2)==1) + { + Index half2 = cols()/2; + row(half).head(half2).swap(row(half).tail(half2).reverse()); + } + } +} + +namespace internal { + +template<int Direction> +struct vectorwise_reverse_inplace_impl; + +template<> +struct vectorwise_reverse_inplace_impl<Vertical> +{ + template<typename ExpressionType> + static void run(ExpressionType &xpr) + { + Index half = xpr.rows()/2; + xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse()); + } +}; + +template<> +struct vectorwise_reverse_inplace_impl<Horizontal> +{ + template<typename ExpressionType> + static void run(ExpressionType &xpr) + { + Index half = xpr.cols()/2; + xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse()); + } +}; + +} // end namespace internal + +/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this. + * + * In most cases it is probably better to simply use the reversed expression + * of a matrix. However, when reversing the matrix data itself is really needed, + * then this "in-place" version is probably the right choice because it provides + * the following additional benefits: + * - less error prone: doing the same operation with .reverse() requires special care: + * \code m = m.reverse().eval(); \endcode + * - this API enables reverse operations without the need for a temporary + * + * \sa DenseBase::reverseInPlace(), reverse() */ +template<typename ExpressionType, int Direction> +void VectorwiseOp<ExpressionType,Direction>::reverseInPlace() +{ + internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived()); } } // end namespace Eigen diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index dcb42821f..3880f7b78 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h @@ -38,13 +38,17 @@ public: template<int StoreMode, int LoadMode> void assignPacket(Index row, Index col) { - m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(row,col), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(row,col)); + PacketScalar tmp = m_src.template packet<LoadMode>(row,col); + const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode>(row,col)); + m_dst.template writePacket<StoreMode>(row,col,tmp); } template<int StoreMode, int LoadMode> void assignPacket(Index index) { - m_functor.template swapPacket<StoreMode,LoadMode,PacketScalar>(&m_dst.coeffRef(index), &const_cast<SrcEvaluatorTypeT&>(m_src).coeffRef(index)); + PacketScalar tmp = m_src.template packet<LoadMode>(index); + const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode>(index)); + m_dst.template writePacket<StoreMode>(index,tmp); } // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael) diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index a15777a5e..ea3d8f4b1 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ 
b/Eigen/src/Core/VectorwiseOp.h @@ -562,6 +562,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp void normalize() { m_matrix = this->normalized(); } + + inline void reverseInPlace(); /////////// Geometry module /////////// diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 161b0aa93..d55ae6096 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -150,14 +150,6 @@ template<typename Scalar> struct swap_assign_op { swap(a,const_cast<Scalar&>(b)); #endif } - - template<int LhsAlignment, int RhsAlignment, typename Packet> - EIGEN_STRONG_INLINE void swapPacket(Scalar* a, Scalar* b) const - { - Packet tmp = internal::ploadt<Packet,RhsAlignment>(b); - internal::pstoret<Scalar,Packet,RhsAlignment>(b, internal::ploadt<Packet,LhsAlignment>(a)); - internal::pstoret<Scalar,Packet,LhsAlignment>(a, tmp); - } }; template<typename Scalar> struct functor_traits<swap_assign_op<Scalar> > { diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index d32377a00..428527820 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -249,10 +249,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n actual_lm = l2; max_mc = 576; } - Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; - + else if (mc==0) return; m = (m%mc)==0 ? mc : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); } diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h index 5ab4525df..39a53c8f1 100644 --- a/Eigen/src/Core/products/LookupBlockingSizesTable.h +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -79,6 +79,14 @@ template <typename LhsScalar, typename RhsScalar> bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads) { + if (num_threads > 1) { + // We don't currently have lookup tables recorded for multithread performance, + // and we have confirmed experimentally that our single-thread-recorded LUTs are + // poor for multithread performance, and our LUTs don't currently contain + // any annotation about multithread status (FIXME - we need that). + // So for now, we just early-return here. + return false; + } return LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar>::run(k, m, n, num_threads); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 6b294e77f..7cedb4c97 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -213,7 +213,8 @@ #endif /// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android -#if defined(__ANDROID__) +// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain. 
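Stepping back to the Reverse.h change above: reverseInPlace() now swaps the two long halves of the matrix (plus a half-swap of the leftover middle row or column when the long dimension is odd) instead of evaluating m = m.reverse() into a temporary. A minimal usage sketch, with illustrative sizes:

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::MatrixXd m = Eigen::MatrixXd::Random(4, 5);
      Eigen::MatrixXd ref = m.reverse();      // out-of-place reference result

      m.reverseInPlace();                     // swaps leftCols(2) with rightCols(2).reverse(),
                                              // then half-swaps the middle column
      std::cout << (m - ref).norm() << "\n";  // prints 0: same values, no temporary

      m.rowwise().reverseInPlace();           // the new vectorwise variant: reverses each row
      return 0;
    }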
+#if defined(__ANDROID__) || defined(ANDROID) #define EIGEN_OS_ANDROID 1 #else #define EIGEN_OS_ANDROID 0 diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index 167cd99ab..b866544b4 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -417,7 +417,7 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect { Scalar t0 = m_matT.coeff(i+1, i); Scalar t1 = m_matT.coeff(i, i+1); - Scalar maxval = numext::maxi(abs(p),numext::maxi(abs(t0),abs(t1))); + Scalar maxval = numext::maxi<Scalar>(abs(p),numext::maxi<Scalar>(abs(t0),abs(t1))); t0 /= maxval; t1 /= maxval; Scalar p0 = p/maxval; @@ -608,7 +608,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors() } // Overflow control - Scalar t = numext::maxi(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n))); + Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n))); if ((eps * t) * t > Scalar(1)) m_matT.block(i, n-1, size-i, 2) /= t; diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index ca75f2f50..677c7c0bb 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -240,10 +240,10 @@ namespace Eigen { m_S.coeffRef(i,j) = Scalar(0.0); m_S.rightCols(dim-j-1).applyOnTheLeft(i-1,i,G.adjoint()); m_T.rightCols(dim-i+1).applyOnTheLeft(i-1,i,G.adjoint()); + // update Q + if (m_computeQZ) + m_Q.applyOnTheRight(i-1,i,G); } - // update Q - if (m_computeQZ) - m_Q.applyOnTheRight(i-1,i,G); // kill T(i,i-1) if(m_T.coeff(i,i-1)!=Scalar(0)) { @@ -251,10 +251,10 @@ namespace Eigen { m_T.coeffRef(i,i-1) = Scalar(0.0); m_S.applyOnTheRight(i,i-1,G); m_T.topRows(i).applyOnTheRight(i,i-1,G); + // update Z + if (m_computeQZ) + m_Z.applyOnTheLeft(i,i-1,G.adjoint()); } - // update Z - if (m_computeQZ) - m_Z.applyOnTheLeft(i,i-1,G.adjoint()); } } } diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index e90ce77eb..a89d75958 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -161,8 +161,8 @@ class QuaternionBase : public RotationBase<Derived, 3> bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const { return coeffs().isApprox(other.coeffs(), prec); } - /** return the result vector of \a v through the rotation*/ - EIGEN_STRONG_INLINE Vector3 _transformVector(Vector3 v) const; + /** return the result vector of \a v through the rotation*/ + EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const; /** \returns \c *this with scalar type casted to \a NewScalarType * @@ -462,7 +462,7 @@ EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const Quaterni */ template <class Derived> EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3 -QuaternionBase<Derived>::_transformVector(Vector3 v) const +QuaternionBase<Derived>::_transformVector(const Vector3& v) const { // Note that this algorithm comes from the optimization by hand // of the conversion to a Matrix followed by a Matrix/Vector product. diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index fd7c8a4b2..9b141c8df 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -84,6 +84,8 @@ public: typedef Matrix<RealScalar, Dynamic, 1> VectorType; typedef Array<RealScalar, Dynamic, 1> ArrayXr; typedef Array<Index,1,Dynamic> ArrayXi; + typedef Ref<ArrayXr> ArrayRef; + typedef Ref<ArrayXi> IndicesRef; /** \brief Default Constructor. 
* @@ -159,21 +161,23 @@ private: void allocate(Index rows, Index cols, unsigned int computationOptions); void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift); void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V); - void computeSingVals(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi& perm, VectorType& singVals, ArrayXr& shifts, ArrayXr& mus); - void perturbCol0(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi& perm, const VectorType& singVals, const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat); - void computeSingVecs(const ArrayXr& zhat, const ArrayXr& diag, const ArrayXi& perm, const VectorType& singVals, const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V); + void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals, ArrayRef shifts, ArrayRef mus); + void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat); + void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V); void deflation43(Index firstCol, Index shift, Index i, Index size); void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size); void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift); template<typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV> void copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naivev); - static void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1); - static RealScalar secularEq(RealScalar x, const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const ArrayXr& diagShifted, RealScalar shift); + void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1); + static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift); protected: MatrixXr m_naiveU, m_naiveV; MatrixXr m_computed; Index m_nRec; + ArrayXr m_workspace; + ArrayXi m_workspaceI; int m_algoswap; bool m_isTranspose, m_compU, m_compV; @@ -212,6 +216,9 @@ void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computati else m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 ); if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize); + + m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3); + m_workspaceI.resize(3*m_diagSize); }// end allocate template<typename MatrixType> @@ -223,6 +230,19 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign allocate(matrix.rows(), matrix.cols(), computationOptions); using std::abs; + //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return + if(matrix.cols() < m_algoswap) + { + // FIXME this line involves temporaries + JacobiSVD<MatrixType> jsvd(matrix,computationOptions); + if(computeU()) m_matrixU = jsvd.matrixU(); + if(computeV()) m_matrixV = jsvd.matrixV(); + m_singularValues = jsvd.singularValues(); + m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); + m_isInitialized = true; + return *this; + } + //**** step 0 - Copy the input matrix and apply scaling to reduce 
over/under-flows RealScalar scale = matrix.cwiseAbs().maxCoeff(); if(scale==RealScalar(0)) scale = RealScalar(1); @@ -231,11 +251,13 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign else copy = matrix/scale; //**** step 1 - Bidiagonalization + // FIXME this line involves temporaries internal::UpperBidiagonalization<MatrixX> bid(copy); //**** step 2 - Divide & Conquer m_naiveU.setZero(); m_naiveV.setZero(); + // FIXME this line involves a temporary matrix m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose(); m_computed.template bottomRows<1>().setZero(); divide(0, m_diagSize - 1, 0, 0, 0); @@ -257,6 +279,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign break; } } + #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE // std::cout << "m_naiveU\n" << m_naiveU << "\n\n"; // std::cout << "m_naiveV\n" << m_naiveV << "\n\n"; @@ -279,14 +302,14 @@ void BDCSVD<MatrixType>::copyUV(const HouseholderU &householderU, const Househol Index Ucols = m_computeThinU ? m_diagSize : householderU.cols(); m_matrixU = MatrixX::Identity(householderU.cols(), Ucols); m_matrixU.topLeftCorner(m_diagSize, m_diagSize) = naiveV.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize); - householderU.applyThisOnTheLeft(m_matrixU); + householderU.applyThisOnTheLeft(m_matrixU); // FIXME this line involves a temporary buffer } if (computeV()) { Index Vcols = m_computeThinV ? m_diagSize : householderV.cols(); m_matrixV = MatrixX::Identity(householderV.cols(), Vcols); m_matrixV.topLeftCorner(m_diagSize, m_diagSize) = naiveU.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize); - householderV.applyThisOnTheLeft(m_matrixV); + householderV.applyThisOnTheLeft(m_matrixV); // FIXME this line involves a temporary buffer } } @@ -307,7 +330,10 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co // If the matrices are large enough, let's exploit the sparse structure of A by // splitting it in half (wrt n1), and packing the non-zero columns. Index n2 = n - n1; - MatrixXr A1(n1,n), A2(n2,n), B1(n,n), B2(n,n); + Map<MatrixXr> A1(m_workspace.data() , n1, n); + Map<MatrixXr> A2(m_workspace.data()+ n1*n, n2, n); + Map<MatrixXr> B1(m_workspace.data()+ n*n, n, n); + Map<MatrixXr> B2(m_workspace.data()+2*n*n, n, n); Index k1=0, k2=0; for(Index j=0; j<n; ++j) { @@ -329,7 +355,11 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2); } else - A *= B; // FIXME this requires a temporary + { + Map<MatrixXr,Aligned> tmp(m_workspace.data(),n,n); + tmp.noalias() = A*B; + A = tmp; + } } // The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the @@ -360,7 +390,8 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, // matrices. if (n < m_algoswap) { - JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0)) ; + // FIXME this line involves temporaries + JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? 
ComputeFullV : 0)); if (m_compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU(); else @@ -438,7 +469,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, } else { - RealScalar q1 = (m_naiveU(0, firstCol + k)); + RealScalar q1 = m_naiveU(0, firstCol + k); // we shift Q1 to the right for (Index i = firstCol + k - 1; i >= firstCol; i--) m_naiveU(0, i + 1) = m_naiveU(0, i); @@ -491,8 +522,14 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, assert(VofSVD.allFinite()); #endif - if (m_compU) structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2); - else m_naiveU.middleCols(firstCol, n + 1) *= UofSVD; // FIXME this requires a temporary, and exploit that there are 2 rows at compile time + if (m_compU) + structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2); + else + { + Map<Matrix<RealScalar,2,Dynamic>,Aligned> tmp(m_workspace.data(),2,n+1); + tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD; + m_naiveU.middleCols(firstCol, n + 1) = tmp; + } if (m_compV) structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2); @@ -517,10 +554,9 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, template <typename MatrixType> void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { - // TODO Get rid of these copies (?) - // FIXME at least preallocate them - ArrayXr col0 = m_computed.col(firstCol).segment(firstCol, n); - ArrayXr diag = m_computed.block(firstCol, firstCol, n, n).diagonal(); + ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n); + m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal(); + ArrayRef diag = m_workspace.head(n); diag(0) = 0; // Allocate space for singular values and vectors @@ -539,13 +575,14 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec Index actual_n = n; while(actual_n>1 && diag(actual_n-1)==0) --actual_n; Index m = 0; // size of the deflated problem - ArrayXi perm(actual_n); for(Index k=0;k<actual_n;++k) if(col0(k)!=0) - perm(m++) = k; - perm.conservativeResize(m); + m_workspaceI(m++) = k; + Map<ArrayXi> perm(m_workspaceI.data(),m); - ArrayXr shifts(n), mus(n), zhat(n); + Map<ArrayXr> shifts(m_workspace.data()+1*n, n); + Map<ArrayXr> mus(m_workspace.data()+2*n, n); + Map<ArrayXr> zhat(m_workspace.data()+3*n, n); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "computeSVDofM using:\n"; @@ -622,8 +659,8 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end singVals.head(actual_n).reverseInPlace(); - U.leftCols(actual_n) = U.leftCols(actual_n).rowwise().reverse().eval(); // FIXME this requires a temporary - if (m_compV) V.leftCols(actual_n) = V.leftCols(actual_n).rowwise().reverse().eval(); // FIXME this requires a temporary + U.leftCols(actual_n).rowwise().reverseInPlace(); + if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace(); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n) ); @@ -634,7 +671,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec } template <typename MatrixType> -typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayXr& col0, const ArrayXr& 
diag, const ArrayXi &perm, const ArrayXr& diagShifted, RealScalar shift) +typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift) { Index m = perm.size(); RealScalar res = 1; @@ -647,8 +684,8 @@ typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar } template <typename MatrixType> -void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, - VectorType& singVals, ArrayXr& shifts, ArrayXr& mus) +void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, + VectorType& singVals, ArrayRef shifts, ArrayRef mus) { using std::abs; using std::swap; @@ -703,7 +740,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right; // measure everything relative to shift - ArrayXr diagShifted = diag - shift; + Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n); + diagShifted = diag - shift; // initial guess RealScalar muPrev, muCur; @@ -730,7 +768,7 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia // rational interpolation: fit a function of the form a / mu + b through the two previous // iterates and use its zero to compute the next iterate bool useBisection = fPrev*fCur>0; - while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection) + while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection) { ++m_numIters; @@ -773,7 +811,10 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia } RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); + +#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift); +#endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(!(fLeft * fRight<0)) @@ -781,14 +822,13 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia #endif eigen_internal_assert(fLeft * fRight < 0); - while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) + while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted))) { RealScalar midShifted = (leftShifted + rightShifted) / 2; RealScalar fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); if (fLeft * fMid < 0) { rightShifted = midShifted; - fRight = fMid; } else { @@ -816,8 +856,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& dia // zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1) template <typename MatrixType> void BDCSVD<MatrixType>::perturbCol0 - (const ArrayXr& col0, const ArrayXr& diag, const ArrayXi &perm, const VectorType& singVals, - const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat) + (const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals, + const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat) { using std::sqrt; Index n = col0.size(); @@ 
-865,8 +905,8 @@ void BDCSVD<MatrixType>::perturbCol0 // compute singular vectors template <typename MatrixType> void BDCSVD<MatrixType>::computeSingVecs - (const ArrayXr& zhat, const ArrayXr& diag, const ArrayXi &perm, const VectorType& singVals, - const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V) + (const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals, + const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V) { Index n = zhat.size(); Index m = perm.size(); @@ -991,7 +1031,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); RealScalar epsilon_strict = NumTraits<RealScalar>::epsilon() * maxDiag; - RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); + RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag); #ifdef EIGEN_BDCSVD_SANITY_CHECKS assert(m_naiveU.allFinite()); @@ -1047,7 +1087,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge. // First, compute the respective permutation. - Index *permutation = new Index[length]; // FIXME avoid repeated dynamic memory allocation + Index *permutation = m_workspaceI.data(); { permutation[0] = 0; Index p = 1; @@ -1084,8 +1124,8 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index } // Current index of each col, and current column of each index - Index *realInd = new Index[length]; // FIXME avoid repeated dynamic memory allocation - Index *realCol = new Index[length]; // FIXME avoid repeated dynamic memory allocation + Index *realInd = m_workspaceI.data()+length; + Index *realCol = m_workspaceI.data()+2*length; for(int pos = 0; pos< length; pos++) { @@ -1115,9 +1155,6 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index realInd[J] = realI; realInd[i] = pi; } - delete[] permutation; - delete[] realInd; - delete[] realCol; } #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n"; diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index fcf01f518..a46a47104 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -425,12 +425,13 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, // If d!=0, then t/d cannot overflow because the magnitude of the // entries forming d are not too small compared to the ones forming t. 
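Returning to the BDCSVD changes above: the recurring theme is replacing per-call heap temporaries (ArrayXr diagShifted, new Index[length], the A1/A2/B1/B2 blocks) with Maps into the two preallocated workspaces m_workspace and m_workspaceI. A minimal sketch of that pattern, with hypothetical names and square operands for brevity:

    #include <Eigen/Dense>
    using Eigen::ArrayXd; using Eigen::Index; using Eigen::Map; using Eigen::MatrixXd;

    // Compute A = A*B through a caller-owned workspace instead of allocating a
    // temporary on every call (same idea as structured_update's tmp above).
    void productInPlace(MatrixXd& A, const MatrixXd& B, ArrayXd& workspace)
    {
      const Index n = A.rows();               // assumes A and B are n-by-n
      workspace.resize(n * n);                // a no-op once the workspace is warm
      Map<MatrixXd> tmp(workspace.data(), n, n);
      tmp.noalias() = A * B;                  // evaluate straight into the workspace
      A = tmp;                                // write back without aliasing
    }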
RealScalar u = t / d; - rot1.s() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u)); - rot1.c() = rot1.s() * u; + RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u)); + rot1.s() = RealScalar(1) / tmp; + rot1.c() = u / tmp; } m.applyOnTheLeft(0,1,rot1); j_right->makeJacobi(m,0,1); - *j_left = rot1 * j_right->transpose(); + *j_left = rot1 * j_right->transpose(); } template<typename _MatrixType, int QRPreconditioner> @@ -680,6 +681,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon(); // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286) + // FIXME What about considerering any denormal numbers as zero, using: + // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)(); const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min(); // Scaling factor to reduce over/under-flows @@ -719,8 +722,9 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig // if this 2x2 sub-matrix is not diagonal already... // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't // keep us iterating forever. Similarly, small denormal numbers are considered zero. - RealScalar threshold = numext::maxi(considerAsZero, precision * numext::maxi(abs(m_workMatrix.coeff(p,p)), - abs(m_workMatrix.coeff(q,q)))); + RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, + precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)), + abs(m_workMatrix.coeff(q,q)))); // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791) if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold) { diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 244f1b50e..d25a161f7 100644 --- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -30,16 +30,16 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r std::memset(mask,0,sizeof(bool)*rows); + typename evaluator<Lhs>::type lhsEval(lhs); + typename evaluator<Rhs>::type rhsEval(rhs); + // estimate the number of non zero entries // given a rhs column containing Y non zeros, we assume that the respective Y columns // of the lhs differs in average of one non zeros, thus the number of non zeros for // the product of a rhs column with the lhs is X+Y where X is the average number of non zero // per column of the lhs. // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs) - Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros(); - - typename evaluator<Lhs>::type lhsEval(lhs); - typename evaluator<Rhs>::type rhsEval(rhs); + Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate(); res.setZero(); res.reserve(Index(estimated_nnz_prod)); diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 2b31716a3..778939791 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -90,7 +90,8 @@ class sparse_matrix_block_impl typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
public:
enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
- EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+ typedef SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> > Base;
+ _EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
protected:
typedef typename Base::IndexVector IndexVector;
enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
@@ -198,20 +199,9 @@ public:
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
inline const StorageIndex* innerNonZeroPtr() const
- { return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
+ { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
inline StorageIndex* innerNonZeroPtr()
- { return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
-
- Index nonZeros() const
- {
- if(m_matrix.isCompressed())
- return ( (m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
- - (m_matrix.outerIndexPtr()[m_outerStart]));
- else if(m_outerSize.value()==0)
- return 0;
- else
- return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
- }
+ { return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); }
bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
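The two fixes above add the missing m_outerStart offset so a block of an uncompressed matrix reads its own slice of the innerNonZeros array. The counting logic that used to live here moves into SparseCompressedBase::nonZeros() (see further down); an illustrative standalone version of that logic:

    #include <Eigen/SparseCore>
    using Eigen::Index;

    // Compressed storage: difference of the outer-index bounds.
    // Uncompressed storage: sum of the per-outer-vector counts, offset by outerStart.
    Index blockNonZeros(const int* outerIndex, const int* innerNonZero,
                        Index outerStart, Index outerSize)
    {
      if (innerNonZero == 0)                  // compressed mode
        return outerIndex[outerStart + outerSize] - outerIndex[outerStart];
      Index nnz = 0;                          // uncompressed mode
      for (Index j = 0; j < outerSize; ++j)
        nnz += innerNonZero[outerStart + j];
      return nnz;
    }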
@@ -233,7 +223,7 @@ public:
const Scalar& lastCoeff() const
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(sparse_matrix_block_impl);
- eigen_assert(nonZeros()>0);
+ eigen_assert(Base::nonZeros()>0);
if(m_matrix.isCompressed())
return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
else
@@ -339,17 +329,6 @@ SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
}
-namespace internal {
-
-template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
- bool OuterVector = (BlockCols==1 && XprType::IsRowMajor)
- | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
- // revert to || as soon as not needed anymore.
- (BlockRows==1 && !XprType::IsRowMajor)>
-class GenericSparseBlockInnerIteratorImpl;
-
-}
-
/** Generic implementation of sparse Block expression.
* Real-only.
*/
@@ -415,8 +394,11 @@ public:
Index blockCols() const { return m_blockCols.value(); }
protected:
- friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
+// friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
friend class ReverseInnerIterator;
+ friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;
+
+ Index nonZeros() const { return Dynamic; }
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
@@ -429,94 +411,6 @@ public:
};
namespace internal {
- template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
- class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,false> : public Block<XprType, BlockRows, BlockCols, InnerPanel>::_MatrixTypeNested::InnerIterator
- {
- typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
- enum {
- IsRowMajor = BlockType::IsRowMajor
- };
- typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
- typedef typename BlockType::StorageIndex StorageIndex;
- typedef typename _MatrixTypeNested::InnerIterator Base;
- const BlockType& m_block;
- Index m_end;
- public:
-
- EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer)
- : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
- m_block(block),
- m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
- {
- while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
- Base::operator++();
- }
-
- inline Index index() const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
- inline Index outer() const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
- inline Index row() const { return Base::row() - m_block.m_startRow.value(); }
- inline Index col() const { return Base::col() - m_block.m_startCol.value(); }
-
- inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
- };
-
- // Row vector of a column-major sparse matrix or column of a row-major one.
- template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
- class GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel,true>
- {
- typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
- enum {
- IsRowMajor = BlockType::IsRowMajor
- };
- typedef typename BlockType::_MatrixTypeNested _MatrixTypeNested;
- typedef typename BlockType::StorageIndex StorageIndex;
- typedef typename BlockType::Scalar Scalar;
- const BlockType& m_block;
- Index m_outerPos;
- Index m_innerIndex;
- Scalar m_value;
- Index m_end;
- public:
-
- explicit EIGEN_STRONG_INLINE GenericSparseBlockInnerIteratorImpl(const BlockType& block, Index outer = 0)
- :
- m_block(block),
- m_outerPos( (IsRowMajor ? block.m_startCol.value() : block.m_startRow.value()) - 1), // -1 so that operator++ finds the first non-zero entry
- m_innerIndex(IsRowMajor ? block.m_startRow.value() : block.m_startCol.value()),
- m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
- {
- EIGEN_UNUSED_VARIABLE(outer);
- eigen_assert(outer==0);
-
- ++(*this);
- }
-
- inline Index index() const { return m_outerPos - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
- inline Index outer() const { return 0; }
- inline Index row() const { return IsRowMajor ? 0 : index(); }
- inline Index col() const { return IsRowMajor ? index() : 0; }
-
- inline Scalar value() const { return m_value; }
-
- inline GenericSparseBlockInnerIteratorImpl& operator++()
- {
- // search next non-zero entry
- while(++m_outerPos<m_end)
- {
- typename XprType::InnerIterator it(m_block.m_matrix, m_outerPos);
- // search for the key m_innerIndex in the current outer-vector
- while(it && it.index() < m_innerIndex) ++it;
- if(it && it.index()==m_innerIndex)
- {
- m_value = it.value();
- break;
- }
- }
- return *this;
- }
-
- inline operator bool() const { return m_outerPos < m_end; }
- };
template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased >
@@ -548,9 +442,16 @@ struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBa
explicit unary_evaluator(const XprType& op)
: m_argImpl(op.nestedExpression()), m_block(op)
{}
+
+ inline Index nonZerosEstimate() const {
+ Index nnz = m_block.nonZeros();
+ if(nnz<0)
+ return m_argImpl.nonZerosEstimate() * m_block.size() / m_block.nestedExpression().size();
+ return nnz;
+ }
protected:
- typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+ typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
typename evaluator<ArgType>::nestedType m_argImpl;
const XprType &m_block;
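For a generic sparse block there is no cheap exact count, so nonZeros() above returns Dynamic as an "unknown" sentinel, and the evaluator falls back to scaling the underlying expression's estimate by the fraction of coefficients the block covers. With hypothetical numbers:

    #include <Eigen/Core>

    // e.g. a 10x10 block of a 100x100 matrix holding ~500 non-zeros:
    //   500 * (10*10) / (100*100) = 5 expected non-zeros in the block.
    Eigen::Index estimateBlockNnz(Eigen::Index argEstimate,
                                  Eigen::Index blockSize, Eigen::Index argSize)
    {
      return argEstimate * blockSize / argSize;  // mirrors nonZerosEstimate() above
    }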
@@ -595,6 +496,7 @@ public:
: m_eval(aEval),
m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) - 1), // -1 so that operator++ finds the first non-zero entry
m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
+ m_value(0),
m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
{
EIGEN_UNUSED_VARIABLE(outer);
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index a5ba45e04..0dbb94faf 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -35,6 +35,25 @@ class SparseCompressedBase class InnerIterator; class ReverseInnerIterator; + protected: + typedef typename Base::IndexVector IndexVector; + Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); } + const Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); } + + public: + + /** \returns the number of non zero coefficients */ + inline Index nonZeros() const + { + if(isCompressed()) + return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0]; + else if(derived().outerSize()==0) + return 0; + else + return innerNonZeros().sum(); + + } + /** \returns a const pointer to the array of values. * This function is aimed at interoperability with other libraries. * \sa innerIndexPtr(), outerIndexPtr() */ @@ -165,6 +184,10 @@ struct evaluator<SparseCompressedBase<Derived> > evaluator() : m_matrix(0) {} explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + inline Index nonZerosEstimate() const { + return m_matrix->nonZeros(); + } + operator Derived&() { return m_matrix->const_cast_derived(); } operator const Derived&() const { return *m_matrix; } diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 3b4e9df59..f53427abf 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -121,6 +121,10 @@ public: m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) { } + + inline Index nonZerosEstimate() const { + return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate(); + } protected: const BinaryOp m_functor; @@ -198,6 +202,10 @@ public: m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) { } + + inline Index nonZerosEstimate() const { + return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate()); + } protected: const BinaryOp m_functor; @@ -243,7 +251,7 @@ public: EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); } EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; } - + protected: const LhsEvaluator &m_lhsEval; RhsIterator m_rhsIter; @@ -262,6 +270,10 @@ public: m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) { } + + inline Index nonZerosEstimate() const { + return m_rhsImpl.nonZerosEstimate(); + } protected: const BinaryOp m_functor; @@ -308,7 +320,7 @@ public: EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); } EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; } - + protected: LhsIterator m_lhsIter; const RhsEvaluator &m_rhsEval; @@ -327,6 +339,10 @@ public: m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) { } + + inline Index nonZerosEstimate() const { + return m_lhsImpl.nonZerosEstimate(); + } protected: const BinaryOp m_functor; diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index 63d8f329c..d484be876 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -30,6 +30,10 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased> }; explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) {} + + inline Index nonZerosEstimate() const { + return 
m_argImpl.nonZerosEstimate(); + } protected: typedef typename evaluator<ArgType>::InnerIterator EvalIterator; diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index a6ff7d559..7c512d9fe 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -105,9 +105,6 @@ class SparseMapBase<Derived,ReadOnlyAccessors> return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); } - /** \returns the number of non zero coefficients */ - inline Index nonZeros() const { return m_nnz; } - inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr, ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0) : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr), diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 4c3a47959..ef93cf80c 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -95,6 +95,7 @@ class SparseMatrix public: typedef SparseCompressedBase<SparseMatrix> Base; using Base::isCompressed; + using Base::nonZeros; _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) @@ -122,9 +123,6 @@ class SparseMatrix StorageIndex* m_outerIndex; StorageIndex* m_innerNonZeros; // optional, if null then the data is compressed Storage m_data; - - Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); } - const Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); } public: @@ -252,14 +250,6 @@ class SparseMatrix memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex)); } - /** \returns the number of non zero coefficients */ - inline Index nonZeros() const - { - if(m_innerNonZeros) - return innerNonZeros().sum(); - return convert_index(Index(m_data.size())); - } - /** Preallocates \a reserveSize non zeros. * * Precondition: the matrix must be in compressed mode. */ diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 55b0ad9d2..d4ab8b908 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -149,9 +149,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived> /** \returns the number of coefficients, which is \a rows()*cols(). * \sa rows(), cols(). */ inline Index size() const { return rows() * cols(); } - /** \returns the number of nonzero coefficients which is in practice the number - * of stored coefficients. */ - inline Index nonZeros() const { return derived().nonZeros(); } /** \returns true if either the number of rows or the number of columns is equal to 1. 
* In other words, this function returns * \code rows()==1 || cols()==1 \endcode diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 3db01bf2d..48050077e 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -33,14 +33,6 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r // allocate a temporary buffer AmbiVector<Scalar,StorageIndex> tempVector(rows); - // estimate the number of non zero entries - // given a rhs column containing Y non zeros, we assume that the respective Y columns - // of the lhs differs in average of one non zeros, thus the number of non zeros for - // the product of a rhs column with the lhs is X+Y where X is the average number of non zero - // per column of the lhs. - // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs) - Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros(); - // mimics a resizeByInnerOuter: if(ResultType::IsRowMajor) res.resize(cols, rows); @@ -49,6 +41,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r typename evaluator<Lhs>::type lhsEval(lhs); typename evaluator<Rhs>::type rhsEval(rhs); + + // estimate the number of non zero entries + // given a rhs column containing Y non zeros, we assume that the respective Y columns + // of the lhs differs in average of one non zeros, thus the number of non zeros for + // the product of a rhs column with the lhs is X+Y where X is the average number of non zero + // per column of the lhs. + // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs) + Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate(); res.reserve(estimated_nnz_prod); double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols()); diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index 45d9c6700..d3fc7f102 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -40,15 +40,11 @@ namespace internal { }; } -// Implement nonZeros() for transpose. I'm not sure that's the best approach for that. -// Perhaps it should be implemented in Transpose<> itself. 
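The nonZerosEstimate() overloads added across the sparse evaluators above follow a few simple combination rules; restated compactly (helper names are illustrative):

    #include <algorithm>
    #include <Eigen/Core>
    using Eigen::Index;

    // sparse + sparse may touch the union of the two patterns:
    Index sumEstimate(Index lhsNnz, Index rhsNnz)  { return lhsNnz + rhsNnz; }
    // coefficient-wise sparse * sparse touches at most the smaller pattern:
    Index prodEstimate(Index lhsNnz, Index rhsNnz) { return (std::min)(lhsNnz, rhsNnz); }
    // unary ops, transposes, and sparse-op-dense keep the operand's count:
    Index passThroughEstimate(Index argNnz)        { return argNnz; }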
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse> : public internal::SparseTransposeImpl<MatrixType> { protected: typedef internal::SparseTransposeImpl<MatrixType> Base; - public: - inline Index nonZeros() const { return Base::derived().nestedExpression().nonZeros(); } }; namespace internal { @@ -61,6 +57,10 @@ struct unary_evaluator<Transpose<ArgType>, IteratorBased> typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator; public: typedef Transpose<ArgType> XprType; + + inline Index nonZerosEstimate() const { + return m_argImpl.nonZerosEstimate(); + } class InnerIterator : public EvalIterator { diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h index b5fbcbdde..34ec07a13 100644 --- a/Eigen/src/SparseCore/SparseTriangularView.h +++ b/Eigen/src/SparseCore/SparseTriangularView.h @@ -50,13 +50,6 @@ protected: template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const; template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const; - - inline Index nonZeros() const { - // FIXME HACK number of nonZeros is required for product logic - // this returns only an upper bound (but should be OK for most purposes) - return derived().nestedExpression().nonZeros(); - } - }; @@ -191,6 +184,10 @@ public: explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()) {} + inline Index nonZerosEstimate() const { + return m_argImpl.nonZerosEstimate(); + } + class InnerIterator : public EvalIterator { typedef EvalIterator Base; diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 35bcec819..7b65f32bc 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -442,6 +442,10 @@ struct evaluator<SparseVector<_Scalar,_Options,_Index> > explicit evaluator(const SparseVectorType &mat) : m_matrix(mat) {} + inline Index nonZerosEstimate() const { + return m_matrix.nonZeros(); + } + operator SparseVectorType&() { return m_matrix.const_cast_derived(); } operator const SparseVectorType&() const { return m_matrix; } diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index efdc6d046..b9d5e48fb 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -302,6 +302,7 @@ class SuperLUBase : public SparseSolverBase<Derived> typedef Matrix<Scalar,Dynamic,1> Vector; typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType; typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType; + typedef Map<PermutationMatrix<Dynamic,Dynamic,int> > PermutationMap; typedef SparseMatrix<Scalar> LUMatrixType; public: @@ -459,10 +460,11 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> > typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; typedef typename Base::IntRowVectorType IntRowVectorType; - typedef typename Base::IntColVectorType IntColVectorType; + typedef typename Base::IntColVectorType IntColVectorType; + typedef typename Base::PermutationMap PermutationMap; typedef typename Base::LUMatrixType LUMatrixType; typedef TriangularView<LUMatrixType, Lower|UnitDiag> LMatrixType; - typedef TriangularView<LUMatrixType, Upper> UMatrixType; + typedef TriangularView<LUMatrixType, Upper> UMatrixType; public: using Base::_solve_impl; @@ -774,6 +776,8 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() 
const det *= m_u.valuePtr()[lastId]; } } + if(PermutationMap(m_p.data(),m_p.size()).determinant()*PermutationMap(m_q.data(),m_q.size()).determinant()<0) + det = -det; if(m_sluEqued!='N') return det/m_sluRscale.prod()/m_sluCscale.prod(); else
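The determinant fix above folds in the signs of the row and column permutations: for a factorization of the form Pr * A * Pc = L * U with unit-diagonal L, det(A) equals det(U) times det(Pr)*det(Pc), and each permutation determinant is +1 or -1. PermutationMatrix::determinant() computes exactly that sign; a self-contained illustrative version:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Sign of a permutation via cycle decomposition: every executed swap is one
    // transposition, and the parity of the swap count gives det(P) = +/-1.
    int permutationSign(std::vector<int> p)        // p[i] = image of index i
    {
      int sign = 1;
      for (std::size_t i = 0; i < p.size(); ++i)
        while (p[i] != static_cast<int>(i)) {
          std::swap(p[i], p[p[i]]);              // place one element of the cycle
          sign = -sign;                          // each swap flips the parity
        }
      return sign;
    }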