Diffstat:
-rw-r--r--  CMakeLists.txt | 6
-rw-r--r--  Eigen/Core | 6
-rw-r--r--  Eigen/src/Cholesky/LDLT.h | 6
-rw-r--r--  Eigen/src/Cholesky/LLT_MKL.h | 2
-rw-r--r--  Eigen/src/Core/Array.h | 37
-rw-r--r--  Eigen/src/Core/ArrayWrapper.h | 6
-rw-r--r--  Eigen/src/Core/Block.h | 30
-rw-r--r--  Eigen/src/Core/CoreEvaluators.h | 20
-rw-r--r--  Eigen/src/Core/CwiseNullaryOp.h | 68
-rw-r--r--  Eigen/src/Core/DenseBase.h | 20
-rw-r--r--  Eigen/src/Core/DenseStorage.h | 79
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 29
-rw-r--r--  Eigen/src/Core/Map.h | 24
-rw-r--r--  Eigen/src/Core/MapBase.h | 12
-rw-r--r--  Eigen/src/Core/Matrix.h | 53
-rw-r--r--  Eigen/src/Core/MatrixBase.h | 5
-rw-r--r--  Eigen/src/Core/PermutationMatrix.h | 9
-rw-r--r--  Eigen/src/Core/PlainObjectBase.h | 87
-rw-r--r--  Eigen/src/Core/Random.h | 8
-rw-r--r--  Eigen/src/Core/Replicate.h | 33
-rw-r--r--  Eigen/src/Core/SolveTriangular.h | 4
-rw-r--r--  Eigen/src/Core/Transpose.h | 2
-rw-r--r--  Eigen/src/Core/Transpositions.h | 6
-rw-r--r--  Eigen/src/Core/TriangularMatrix.h | 31
-rw-r--r--  Eigen/src/Core/arch/AVX/MathFunctions.h | 132
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h | 24
-rw-r--r--  Eigen/src/Core/arch/AVX/TypeCasting.h | 51
-rw-r--r--  Eigen/src/Core/arch/AltiVec/Complex.h | 2
-rw-r--r--  Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h | 110
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h | 4
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h | 6
-rw-r--r--  Eigen/src/Core/arch/SSE/MathFunctions.h | 52
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h | 2
-rw-r--r--  Eigen/src/Core/arch/SSE/TypeCasting.h | 77
-rw-r--r--  Eigen/src/Core/functors/BinaryFunctors.h | 42
-rw-r--r--  Eigen/src/Core/functors/UnaryFunctors.h | 19
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 19
-rw-r--r--  Eigen/src/Core/products/LookupBlockingSizesTable.h | 97
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix.h | 4
-rw-r--r--  Eigen/src/Core/util/Constants.h | 10
-rw-r--r--  Eigen/src/Core/util/ForwardDeclarations.h | 9
-rw-r--r--  Eigen/src/Core/util/Macros.h | 23
-rw-r--r--  Eigen/src/Core/util/Memory.h | 22
-rw-r--r--  Eigen/src/Core/util/XprHelper.h | 12
-rw-r--r--  Eigen/src/Eigenvalues/RealQZ.h | 6
-rw-r--r--  Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 220
-rw-r--r--  Eigen/src/Geometry/AlignedBox.h | 81
-rw-r--r--  Eigen/src/Geometry/Homogeneous.h | 62
-rw-r--r--  Eigen/src/Geometry/Quaternion.h | 10
-rw-r--r--  Eigen/src/IterativeLinearSolvers/BiCGSTAB.h | 11
-rw-r--r--  Eigen/src/IterativeLinearSolvers/IncompleteLUT.h | 4
-rw-r--r--  Eigen/src/SVD/BDCSVD.h | 4
-rw-r--r--  Eigen/src/SVD/JacobiSVD.h | 2
-rw-r--r--  Eigen/src/SparseCore/CompressedStorage.h | 9
-rw-r--r--  Eigen/src/SparseCore/SparseBlock.h | 3
-rw-r--r--  Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 18
-rw-r--r--  Eigen/src/SparseCore/SparseDenseProduct.h | 23
-rw-r--r--  Eigen/src/SparseCore/SparseDiagonalProduct.h | 4
-rw-r--r--  Eigen/src/SparseCore/SparseMatrix.h | 38
-rw-r--r--  Eigen/src/SparseCore/SparseMatrixBase.h | 6
-rw-r--r--  Eigen/src/SparseCore/SparseRef.h | 30
-rw-r--r--  Eigen/src/SparseCore/SparseTriangularView.h | 11
-rw-r--r--  Eigen/src/SparseCore/SparseView.h | 6
-rw-r--r--  Eigen/src/SparseCore/TriangularSolver.h | 2
-rw-r--r--  Eigen/src/SuperLUSupport/SuperLUSupport.h | 2
-rw-r--r--  Eigen/src/UmfPackSupport/UmfPackSupport.h | 103
-rw-r--r--  Eigen/src/plugins/ArrayCwiseBinaryOps.h | 54
-rw-r--r--  Eigen/src/plugins/ArrayCwiseUnaryOps.h | 23
-rw-r--r--  Eigen/src/plugins/MatrixCwiseBinaryOps.h | 18
-rw-r--r--  Eigen/src/plugins/MatrixCwiseUnaryOps.h | 21
-rw-r--r--  bench/btl/libs/blaze/blaze_interface.hh | 8
-rw-r--r--  bench/btl/libs/blaze/main.cpp | 2
-rw-r--r--  bench/eig33.cpp | 57
-rw-r--r--  blas/level2_impl.h | 4
-rw-r--r--  blas/level3_impl.h | 16
-rw-r--r--  doc/A05_PortingFrom2To3.dox | 2
-rw-r--r--  doc/B01_Experimental.dox | 6
-rw-r--r--  doc/Doxyfile.in | 14
-rw-r--r--  doc/FixedSizeVectorizable.dox | 4
-rw-r--r--  doc/PreprocessorDirectives.dox | 3
-rw-r--r--  doc/StlContainers.dox | 4
-rw-r--r--  doc/StructHavingEigenMembers.dox | 18
-rw-r--r--  doc/TemplateKeyword.dox | 6
-rw-r--r--  doc/TopicMultithreading.dox | 2
-rw-r--r--  doc/UsingNVCC.dox | 2
-rw-r--r--  doc/snippets/MatrixBase_marked.cpp | 14
-rw-r--r--  doc/snippets/MatrixBase_part.cpp | 13
-rw-r--r--  doc/snippets/MatrixBase_triangularView.cpp (renamed from doc/snippets/MatrixBase_extract.cpp) | 12
-rw-r--r--  doc/snippets/Triangular_solve.cpp | 11
-rw-r--r--  failtest/CMakeLists.txt | 2
-rw-r--r--  failtest/sparse_storage_mismatch.cpp | 16
-rw-r--r--  test/CMakeLists.txt | 2
-rw-r--r--  test/array.cpp | 4
-rw-r--r--  test/array_for_matrix.cpp | 1
-rw-r--r--  test/array_replicate.cpp | 13
-rw-r--r--  test/bicgstab.cpp | 3
-rw-r--r--  test/ctorleak.cpp | 36
-rw-r--r--  test/eigensolver_selfadjoint.cpp | 84
-rw-r--r--  test/geo_homogeneous.cpp | 15
-rw-r--r--  test/inverse.cpp | 10
-rw-r--r--  test/is_same_dense.cpp | 30
-rw-r--r--  test/lu.cpp | 9
-rw-r--r--  test/mapped_matrix.cpp | 57
-rw-r--r--  test/mapstride.cpp | 41
-rw-r--r--  test/permutationmatrices.cpp | 25
-rw-r--r--  test/simplicial_cholesky.cpp | 4
-rw-r--r--  test/sparse_product.cpp | 31
-rw-r--r--  test/sparse_ref.cpp | 18
-rw-r--r--  test/sparse_solver.h | 203
-rw-r--r--  test/sparse_vector.cpp | 10
-rw-r--r--  test/sparselu.cpp | 4
-rw-r--r--  test/svd_common.h | 61
-rw-r--r--  test/svd_fill.h | 99
-rw-r--r--  unsupported/Eigen/CXX11/Tensor | 15
-rw-r--r--  unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 16
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 27
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 30
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 71
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 206
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h | 162
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 44
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 17
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 275
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 181
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 64
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 25
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 28
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 71
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 60
-rw-r--r--  unsupported/Eigen/src/IterativeSolvers/GMRES.h | 331
-rw-r--r--  unsupported/Eigen/src/LevenbergMarquardt/LMpar.h | 2
-rw-r--r--  unsupported/Eigen/src/SparseExtra/MarketIO.h | 2
-rw-r--r--  unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h | 43
-rw-r--r--  unsupported/Eigen/src/Splines/Spline.h | 2
-rw-r--r--  unsupported/test/CMakeLists.txt | 2
-rw-r--r--  unsupported/test/cxx11_tensor_casts.cpp | 76
-rw-r--r--  unsupported/test/cxx11_tensor_chipping.cpp | 40
-rw-r--r--  unsupported/test/cxx11_tensor_concatenation.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_contraction.cpp | 8
-rw-r--r--  unsupported/test/cxx11_tensor_convolution.cpp | 18
-rw-r--r--  unsupported/test/cxx11_tensor_dimension.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_generator.cpp | 91
-rw-r--r--  unsupported/test/cxx11_tensor_intdiv.cpp | 20
-rw-r--r--  unsupported/test/cxx11_tensor_mixed_indices.cpp | 53
-rw-r--r--  unsupported/test/cxx11_tensor_morphing.cpp | 34
-rw-r--r--  unsupported/test/cxx11_tensor_of_complex.cpp | 2
-rw-r--r--  unsupported/test/cxx11_tensor_of_strings.cpp | 6
-rw-r--r--  unsupported/test/cxx11_tensor_random.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_simple.cpp | 6
-rw-r--r--  unsupported/test/cxx11_tensor_thread_pool.cpp | 21
-rw-r--r--  unsupported/test/levenberg_marquardt.cpp | 25

160 files changed, 3499 insertions(+), 1682 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a28ad07d8..e037af3bc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,6 +147,12 @@ if(NOT MSVC)
ei_add_cxx_compiler_flag("-Wenum-conversion")
ei_add_cxx_compiler_flag("-Wc++11-extensions")
+ # -Wshadow is far too strict with gcc; hopefully it will become usable with gcc 6
+ # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
+ if(NOT CMAKE_COMPILER_IS_GNUCXX)
+ ei_add_cxx_compiler_flag("-Wshadow")
+ endif()
+
ei_add_cxx_compiler_flag("-Wno-psabi")
ei_add_cxx_compiler_flag("-Wno-variadic-macros")
ei_add_cxx_compiler_flag("-Wno-long-long")
diff --git a/Eigen/Core b/Eigen/Core
index 1a3249604..80d7c1a69 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -178,7 +178,7 @@
#undef bool
#undef vector
#undef pixel
- #elif defined __ARM_NEON
+ #elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_NEON
#include <arm_neon.h>
@@ -297,10 +297,12 @@ using std::ptrdiff_t;
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX/Complex.h"
+ #include "src/Core/arch/AVX/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/SSE/Complex.h"
+ #include "src/Core/arch/SSE/TypeCasting.h"
#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
#include "src/Core/arch/AltiVec/PacketMath.h"
#include "src/Core/arch/AltiVec/Complex.h"
@@ -308,7 +310,6 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/PacketMath.h"
#include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h"
- #include "src/Core/arch/NEON/BlockingSizesLookupTables.h"
#endif
#if defined EIGEN_VECTORIZE_CUDA
@@ -382,7 +383,6 @@ using std::ptrdiff_t;
#include "src/Core/Inverse.h"
#include "src/Core/TriangularMatrix.h"
#include "src/Core/SelfAdjointView.h"
-#include "src/Core/products/LookupBlockingSizesTable.h"
#include "src/Core/products/GeneralBlockPanelKernel.h"
#include "src/Core/products/Parallelizer.h"
#include "src/Core/ProductEvaluators.h"
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 93a726483..37179521a 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -314,9 +314,9 @@ template<> struct ldlt_inplace<Lower>
}
// In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
- // was smaller than the cutoff value. However, soince LDLT is not rank-revealing
- // we should only make sure we do not introduce INF or NaN values.
- // LAPACK also uses 0 as the cutoff value.
+ // was smaller than the cutoff value. However, since LDLT is not rank-revealing
+ // we should only make sure that we do not introduce INF or NaN values.
+ // Remark that LAPACK also uses 0 as the cutoff value.
RealScalar realAkk = numext::real(mat.coeffRef(k,k));
if((rs>0) && (abs(realAkk) > RealScalar(0)))
A21 /= realAkk;
diff --git a/Eigen/src/Cholesky/LLT_MKL.h b/Eigen/src/Cholesky/LLT_MKL.h
index 09bf59d43..0d42cb5bc 100644
--- a/Eigen/src/Cholesky/LLT_MKL.h
+++ b/Eigen/src/Cholesky/LLT_MKL.h
@@ -60,7 +60,7 @@ template<> struct mkl_llt<EIGTYPE> \
lda = m.outerStride(); \
\
info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
- info = (info==0) ? Success : NumericalIssue; \
+ info = (info==0) ? -1 : info>0 ? info-1 : size; \
return info; \
} \
}; \
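
Note the changed return convention: the wrapper no longer returns a ComputationInfo value but mimics the generic blocked LLT kernel, which reports -1 on success and otherwise the zero-based index of the failing pivot. A sketch of the mapping (the helper name is hypothetical, for illustration only):

    // info as returned by LAPACK potrf: 0 = success, >0 = 1-based failing
    // pivot, <0 = invalid argument. Eigen's kernel convention: -1 = success,
    // k >= 0 = zero-based failing pivot.
    inline int translate_potrf_info(int info, int size) {  // hypothetical helper
      return (info == 0) ? -1         // success
           : (info > 0)  ? info - 1   // convert to a zero-based pivot index
           : size;                    // invalid argument: out-of-range marker
    }
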
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 9a1f30bc8..d8a277143 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -74,7 +74,7 @@ class Array
{
return Base::operator=(other);
}
-
+
/** Set all the entries to \a value.
* \sa DenseBase::setConstant(), DenseBase::fill()
*/
@@ -101,7 +101,7 @@ class Array
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
+ EIGEN_STRONG_INLINE Array& operator=(const DenseBase<OtherDerived>& other)
{
return Base::_set(other);
}
@@ -222,43 +222,18 @@ class Array
m_storage.data()[3] = val3;
}
- /** Constructor copying the value of the expression \a other */
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
- : Base(other.rows() * other.cols(), other.rows(), other.cols())
- {
- Base::_check_template_params();
- Base::_set_noalias(other);
- }
/** Copy constructor */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const Array& other)
- : Base(other.rows() * other.cols(), other.rows(), other.cols())
- {
- Base::_check_template_params();
- Base::_set_noalias(other);
- }
- /** Copy constructor with in-place evaluation */
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
- {
- Base::_check_template_params();
- Base::resize(other.rows(), other.cols());
- other.evalTo(*this);
- }
+ : Base(other)
+ { }
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
- : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
- {
- Base::_check_template_params();
- Base::_resize_to_match(other);
- *this = other;
- }
+ : Base(other.derived())
+ { }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
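
The dedicated ArrayBase and ReturnByValue constructors can be removed because PlainObjectBase now provides equivalent generic constructors (see the PlainObjectBase.h hunks below), so user code is unaffected. A quick sketch of constructions that still compile:

    Eigen::ArrayXXf a = Eigen::ArrayXXf::Random(3, 3);
    Eigen::ArrayXXf b(a + a);  // expression: handled by PlainObjectBase(const DenseBase&)
    Eigen::ArrayXXf c(b);      // plain copy: delegates to the base copy constructor
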
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 0b89c58cb..2ef112986 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -149,7 +149,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index,Index)*/
EIGEN_DEVICE_FUNC
- void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+ void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
protected:
NestedExpressionType m_expression;
@@ -198,7 +198,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
EIGEN_DEVICE_FUNC
- explicit inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
+ explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_expression.rows(); }
@@ -288,7 +288,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index,Index)*/
EIGEN_DEVICE_FUNC
- void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+ void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
protected:
NestedExpressionType m_expression;
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 2ef37ca1c..7f84534e1 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -124,26 +124,26 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
/** Fixed-size constructor
*/
EIGEN_DEVICE_FUNC
- inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
- : Impl(xpr, a_startRow, a_startCol)
+ inline Block(XprType& xpr, Index startRow, Index startCol)
+ : Impl(xpr, startRow, startCol)
{
EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
- eigen_assert(a_startRow >= 0 && BlockRows >= 1 && a_startRow + BlockRows <= xpr.rows()
- && a_startCol >= 0 && BlockCols >= 1 && a_startCol + BlockCols <= xpr.cols());
+ eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows()
+ && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols());
}
/** Dynamic-size constructor
*/
EIGEN_DEVICE_FUNC
inline Block(XprType& xpr,
- Index a_startRow, Index a_startCol,
+ Index startRow, Index startCol,
Index blockRows, Index blockCols)
- : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols)
+ : Impl(xpr, startRow, startCol, blockRows, blockCols)
{
eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
&& (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
- eigen_assert(a_startRow >= 0 && blockRows >= 0 && a_startRow <= xpr.rows() - blockRows
- && a_startCol >= 0 && blockCols >= 0 && a_startCol <= xpr.cols() - blockCols);
+ eigen_assert(startRow >= 0 && blockRows >= 0 && startRow <= xpr.rows() - blockRows
+ && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols);
}
};
@@ -159,10 +159,10 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
typedef Impl Base;
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
- EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
+ EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
EIGEN_DEVICE_FUNC
- inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
- : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
+ inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+ : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
};
namespace internal {
@@ -198,8 +198,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
/** Fixed-size constructor
*/
EIGEN_DEVICE_FUNC
- inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
- : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+ inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+ : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
m_blockRows(BlockRows), m_blockCols(BlockCols)
{}
@@ -207,9 +207,9 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
*/
EIGEN_DEVICE_FUNC
inline BlockImpl_dense(XprType& xpr,
- Index a_startRow, Index a_startCol,
+ Index startRow, Index startCol,
Index blockRows, Index blockCols)
- : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+ : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
m_blockRows(blockRows), m_blockCols(blockCols)
{}
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index ce00566a5..850877079 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -934,6 +934,16 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
return m_argImpl.coeff(actual_row, actual_col);
}
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ // try to avoid using modulo; this is a pure optimization strategy
+ const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+ ? (ColFactor==1 ? index : index%m_cols.value())
+ : (RowFactor==1 ? index : index%m_rows.value());
+
+ return m_argImpl.coeff(actual_index);
+ }
template<int LoadMode>
PacketReturnType packet(Index row, Index col) const
@@ -947,6 +957,16 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
return m_argImpl.template packet<LoadMode>(actual_row, actual_col);
}
+
+ template<int LoadMode>
+ PacketReturnType packet(Index index) const
+ {
+ const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+ ? (ColFactor==1 ? index : index%m_cols.value())
+ : (RowFactor==1 ? index : index%m_rows.value());
+
+ return m_argImpl.template packet<LoadMode>(actual_index);
+ }
protected:
const ArgTypeNested m_arg; // FIXME is it OK to store both the argument and its evaluator?? (we have the same situation in evaluator_product)
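
These linear-access overloads complement the (row, col) versions above and apply when the replicated argument is a row or column vector; as the comment says, the modulo is skipped entirely when the matching replication factor is 1. A small sketch of the index mapping, assuming a replicated row vector:

    Eigen::RowVector3f x(1, 2, 3);
    auto r = x.replicate<1, 4>();  // a 1x12 replicate expression
    // Linear access maps back into the source: r.coeff(7) == x.coeff(7 % 3) == 2.
    // With ColFactor == 1 the modulo is skipped since the index is already valid.
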
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index c7dfedae4..87ea14bac 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -49,13 +49,13 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
EIGEN_DEVICE_FUNC
- CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
- : m_rows(nbRows), m_cols(nbCols), m_functor(func)
+ CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
+ : m_rows(rows), m_cols(cols), m_functor(func)
{
- eigen_assert(nbRows >= 0
- && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
- && nbCols >= 0
- && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
+ eigen_assert(rows >= 0
+ && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+ && cols >= 0
+ && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
}
EIGEN_DEVICE_FUNC
@@ -166,11 +166,11 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
/** \returns an expression of a constant matrix of value \a value
*
- * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+ * The parameters \a rows and \a cols are the number of rows and of columns of
* the returned matrix. Must be compatible with this DenseBase type.
*
* This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
- * it is redundant to pass \a nbRows and \a nbCols as arguments, so Zero() should be used
+ * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
* instead.
*
* The template parameter \a CustomNullaryOp is the type of the functor.
@@ -179,9 +179,9 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Constant(Index nbRows, Index nbCols, const Scalar& value)
+DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
{
- return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_constant_op<Scalar>(value));
+ return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
}
/** \returns an expression of a constant matrix of value \a value
@@ -357,8 +357,8 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
/** Resizes to the given size, and sets all coefficients in this expression to the given \a value.
*
- * \param nbRows the new number of rows
- * \param nbCols the new number of columns
+ * \param rows the new number of rows
+ * \param cols the new number of columns
* \param val the value to which all coefficients are set
*
* Example: \include Matrix_setConstant_int_int.cpp
@@ -368,9 +368,9 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setConstant(Index nbRows, Index nbCols, const Scalar& val)
+PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
{
- resize(nbRows, nbCols);
+ resize(rows, cols);
return setConstant(val);
}
@@ -429,9 +429,9 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Zero(Index nbRows, Index nbCols)
+DenseBase<Derived>::Zero(Index rows, Index cols)
{
- return Constant(nbRows, nbCols, Scalar(0));
+ return Constant(rows, cols, Scalar(0));
}
/** \returns an expression of a zero vector.
@@ -525,8 +525,8 @@ PlainObjectBase<Derived>::setZero(Index newSize)
/** Resizes to the given size, and sets all coefficients in this expression to zero.
*
- * \param nbRows the new number of rows
- * \param nbCols the new number of columns
+ * \param rows the new number of rows
+ * \param cols the new number of columns
*
* Example: \include Matrix_setZero_int_int.cpp
* Output: \verbinclude Matrix_setZero_int_int.out
@@ -535,9 +535,9 @@ PlainObjectBase<Derived>::setZero(Index newSize)
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setZero(Index rows, Index cols)
{
- resize(nbRows, nbCols);
+ resize(rows, cols);
return setConstant(Scalar(0));
}
@@ -545,7 +545,7 @@ PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
/** \returns an expression of a matrix where all coefficients equal one.
*
- * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+ * The parameters \a rows and \a cols are the number of rows and of columns of
* the returned matrix. Must be compatible with this MatrixBase type.
*
* This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -559,9 +559,9 @@ PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Ones(Index nbRows, Index nbCols)
+DenseBase<Derived>::Ones(Index rows, Index cols)
{
- return Constant(nbRows, nbCols, Scalar(1));
+ return Constant(rows, cols, Scalar(1));
}
/** \returns an expression of a vector where all coefficients equal one.
@@ -651,8 +651,8 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
/** Resizes to the given size, and sets all coefficients in this expression to one.
*
- * \param nbRows the new number of rows
- * \param nbCols the new number of columns
+ * \param rows the new number of rows
+ * \param cols the new number of columns
*
* Example: \include Matrix_setOnes_int_int.cpp
* Output: \verbinclude Matrix_setOnes_int_int.out
@@ -661,9 +661,9 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
{
- resize(nbRows, nbCols);
+ resize(rows, cols);
return setConstant(Scalar(1));
}
@@ -671,7 +671,7 @@ PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
/** \returns an expression of the identity matrix (not necessarily square).
*
- * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+ * The parameters \a rows and \a cols are the number of rows and of columns of
* the returned matrix. Must be compatible with this MatrixBase type.
*
* This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -685,9 +685,9 @@ PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
-MatrixBase<Derived>::Identity(Index nbRows, Index nbCols)
+MatrixBase<Derived>::Identity(Index rows, Index cols)
{
- return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_identity_op<Scalar>());
+ return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
}
/** \returns an expression of the identity matrix (not necessarily square).
@@ -783,8 +783,8 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
/** \brief Resizes to the given size, and writes the identity expression (not necessarily square) into *this.
*
- * \param nbRows the new number of rows
- * \param nbCols the new number of columns
+ * \param rows the new number of rows
+ * \param cols the new number of columns
*
* Example: \include Matrix_setIdentity_int_int.cpp
* Output: \verbinclude Matrix_setIdentity_int_int.out
@@ -792,9 +792,9 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
*/
template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index nbRows, Index nbCols)
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
{
- derived().resize(nbRows, nbCols);
+ derived().resize(rows, cols);
return setIdentity();
}
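
All of the renames above are purely cosmetic (nbRows/nbCols become rows/cols); the resize-and-fill overloads behave as before. For reference, the two-argument variants documented in this file:

    Eigen::MatrixXd m;
    m.setConstant(3, 4, 7.0);  // resize to 3x4, then fill with 7
    m.setZero(2, 2);           // resize to 2x2, then zero
    m.setOnes(4, 1);           // resize to 4x1, then fill with 1
    m.setIdentity(3, 5);       // resize, then write a (non-square) identity
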
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index c30d1bed9..361462e54 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -66,7 +66,14 @@ template<typename Derived> class DenseBase
*/
typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+ /** The numeric type of the expression's coefficients, e.g. float, double, int or std::complex<float>, etc. */
typedef typename internal::traits<Derived>::Scalar Scalar;
+
+ /** The numeric type of the expression's coefficients, e.g. float, double, int or std::complex<float>, etc.
+ *
+ * It is an alias for the Scalar type */
+ typedef Scalar value_type;
+
typedef typename internal::packet_traits<Scalar>::type PacketScalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -221,11 +228,11 @@ template<typename Derived> class DenseBase
* nothing else.
*/
EIGEN_DEVICE_FUNC
- void resize(Index nbRows, Index nbCols)
+ void resize(Index rows, Index cols)
{
- EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
- EIGEN_ONLY_USED_FOR_DEBUG(nbCols);
- eigen_assert(nbRows == this->rows() && nbCols == this->cols()
+ EIGEN_ONLY_USED_FOR_DEBUG(rows);
+ EIGEN_ONLY_USED_FOR_DEBUG(cols);
+ eigen_assert(rows == this->rows() && cols == this->cols()
&& "DenseBase::resize() does not actually allow to resize.");
}
@@ -269,13 +276,12 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& func);
-#ifndef EIGEN_PARSED_BY_DOXYGEN
- /** Copies \a other into *this without evaluating other. \returns a reference to *this.
+ /** \internal
+ * Copies \a other into *this without evaluating other. \returns a reference to *this.
* \deprecated */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& lazyAssign(const DenseBase<OtherDerived>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN
EIGEN_DEVICE_FUNC
CommaInitializer<Derived> operator<< (const Scalar& s);
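
The new value_type alias mirrors the STL container convention, which helps generic code that expects Container::value_type; Scalar remains the canonical Eigen name. A minimal sketch:

    // Works for any dense Eigen expression now that value_type is defined.
    template<typename Container>
    typename Container::value_type first_coeff(const Container& c) {
      return c(0);
    }
    // first_coeff(Eigen::Vector3d(1.0, 2.0, 3.0)) returns double(1.0)
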
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 8fcc83a5a..80c4c6e8e 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -218,7 +218,13 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
if (this != &other) m_data = other.m_data;
return *this;
}
- EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
+ EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
+ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+ eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
+ EIGEN_UNUSED_VARIABLE(size);
+ EIGEN_UNUSED_VARIABLE(rows);
+ EIGEN_UNUSED_VARIABLE(cols);
+ }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
@@ -277,13 +283,13 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
}
return *this;
}
- DenseStorage(Index, Index nbRows, Index nbCols) : m_rows(nbRows), m_cols(nbCols) {}
+ DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
- void conservativeResize(Index, Index nbRows, Index nbCols) { m_rows = nbRows; m_cols = nbCols; }
- void resize(Index, Index nbRows, Index nbCols) { m_rows = nbRows; m_cols = nbCols; }
+ void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+ void resize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
};
@@ -307,12 +313,12 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
}
return *this;
}
- DenseStorage(Index, Index nbRows, Index) : m_rows(nbRows) {}
+ DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
- void conservativeResize(Index, Index nbRows, Index) { m_rows = nbRows; }
- void resize(Index, Index nbRows, Index) { m_rows = nbRows; }
+ void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
+ void resize(Index, Index rows, Index) { m_rows = rows; }
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
};
@@ -336,12 +342,12 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
}
return *this;
}
- DenseStorage(Index, Index, Index nbCols) : m_cols(nbCols) {}
+ DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
- void conservativeResize(Index, Index, Index nbCols) { m_cols = nbCols; }
- void resize(Index, Index, Index nbCols) { m_cols = nbCols; }
+ void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+ void resize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
};
@@ -356,9 +362,12 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
: m_data(0), m_rows(0), m_cols(0) {}
- DenseStorage(Index size, Index nbRows, Index nbCols)
- : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
- { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
+ DenseStorage(Index size, Index rows, Index cols)
+ : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
+ {
+ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+ eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
+ }
DenseStorage(const DenseStorage& other)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols))
, m_rows(other.m_rows)
@@ -401,13 +410,13 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
- void conservativeResize(Index size, Index nbRows, Index nbCols)
+ void conservativeResize(Index size, Index rows, Index cols)
{
m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
- m_rows = nbRows;
- m_cols = nbCols;
+ m_rows = rows;
+ m_cols = cols;
}
- void resize(Index size, Index nbRows, Index nbCols)
+ void resize(Index size, Index rows, Index cols)
{
if(size != m_rows*m_cols)
{
@@ -418,8 +427,8 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
}
- m_rows = nbRows;
- m_cols = nbCols;
+ m_rows = rows;
+ m_cols = cols;
}
EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
EIGEN_DEVICE_FUNC T *data() { return m_data; }
@@ -433,8 +442,12 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
public:
EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
- DenseStorage(Index size, Index, Index nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
- { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
+ DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
+ {
+ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+ eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
+ EIGEN_UNUSED_VARIABLE(rows);
+ }
DenseStorage(const DenseStorage& other)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
, m_cols(other.m_cols)
@@ -472,12 +485,12 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
- void conservativeResize(Index size, Index, Index nbCols)
+ void conservativeResize(Index size, Index, Index cols)
{
m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
- m_cols = nbCols;
+ m_cols = cols;
}
- EIGEN_STRONG_INLINE void resize(Index size, Index, Index nbCols)
+ EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols)
{
if(size != _Rows*m_cols)
{
@@ -488,7 +501,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
}
- m_cols = nbCols;
+ m_cols = cols;
}
EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
EIGEN_DEVICE_FUNC T *data() { return m_data; }
@@ -502,8 +515,12 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
public:
EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
- DenseStorage(Index size, Index nbRows, Index) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
- { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
+ DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
+ {
+ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+ eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
+ EIGEN_UNUSED_VARIABLE(cols);
+ }
DenseStorage(const DenseStorage& other)
: m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
, m_rows(other.m_rows)
@@ -541,12 +558,12 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
- void conservativeResize(Index size, Index nbRows, Index)
+ void conservativeResize(Index size, Index rows, Index)
{
m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
- m_rows = nbRows;
+ m_rows = rows;
}
- EIGEN_STRONG_INLINE void resize(Index size, Index nbRows, Index)
+ EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index)
{
if(size != m_rows*_Cols)
{
@@ -557,7 +574,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
}
- m_rows = nbRows;
+ m_rows = rows;
}
EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
EIGEN_DEVICE_FUNC T *data() { return m_data; }
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 85c0cb4ba..8a7a0eddc 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -59,6 +59,7 @@ struct default_packet_traits
HasDiv = 0,
HasSqrt = 0,
+ HasRsqrt = 0,
HasExp = 0,
HasLog = 0,
HasLog10 = 0,
@@ -106,6 +107,28 @@ template<typename T> struct packet_traits : default_packet_traits
template<typename T> struct packet_traits<const T> : packet_traits<T> { };
+template <typename Src, typename Tgt> struct type_casting_traits {
+ enum {
+ VectorizedCast = 0,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+
+/** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a) {
+ return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
+ return static_cast<TgtPacket>(a);
+}
+
+
/** \internal \returns a + b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
padd(const Packet& a,
@@ -381,6 +404,12 @@ Packet plog10(const Packet& a) { using std::log10; return log10(a); }
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
+/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet prsqrt(const Packet& a) {
+ return pdiv(pset1<Packet>(1), psqrt(a));
+}
+
/** \internal \returns the rounded value of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pround(const Packet& a) { using numext::round; return round(a); }
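
type_casting_traits advertises whether a cast is vectorized and how many source coefficients map to how many target coefficients; the two generic pcast overloads above are scalar fallbacks (the second consumes two source packets when SrcCoeffRatio is 2). A sketch of what an SSE double->float specialization expresses, assuming SSE2 packet types (one possible implementation, not necessarily the one in TypeCasting.h):

    // Two Packet2d (2 doubles each) are consumed per Packet4f (4 floats) produced.
    template <> struct type_casting_traits<double, float> {
      enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
    };
    template <> EIGEN_STRONG_INLINE Packet4f
    pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
      // combine the two converted low halves into a single 4-float packet
      return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b));
    }
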
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 870e11024..6b2b3ade4 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -117,11 +117,11 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
/** Constructor in the fixed-size case.
*
* \param dataPtr pointer to the array to map
- * \param a_stride optional Stride object, passing the strides.
+ * \param stride optional Stride object, passing the strides.
*/
EIGEN_DEVICE_FUNC
- explicit inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
- : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
+ explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
+ : Base(cast_to_pointer_type(dataPtr)), m_stride(stride)
{
PlainObjectType::Base::_check_template_params();
}
@@ -129,12 +129,12 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
/** Constructor in the dynamic-size vector case.
*
* \param dataPtr pointer to the array to map
- * \param a_size the size of the vector expression
- * \param a_stride optional Stride object, passing the strides.
+ * \param size the size of the vector expression
+ * \param stride optional Stride object, passing the strides.
*/
EIGEN_DEVICE_FUNC
- inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
- : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
+ inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
+ : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride)
{
PlainObjectType::Base::_check_template_params();
}
@@ -142,13 +142,13 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
/** Constructor in the dynamic-size matrix case.
*
* \param dataPtr pointer to the array to map
- * \param nbRows the number of rows of the matrix expression
- * \param nbCols the number of columns of the matrix expression
- * \param a_stride optional Stride object, passing the strides.
+ * \param rows the number of rows of the matrix expression
+ * \param cols the number of columns of the matrix expression
+ * \param stride optional Stride object, passing the strides.
*/
EIGEN_DEVICE_FUNC
- inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
- : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
+ inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType())
+ : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride)
{
PlainObjectType::Base::_check_template_params();
}
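
The Map constructors keep their three shapes (fixed-size, dynamic vector, dynamic matrix with an optional stride); only the parameter names changed. A brief usage sketch:

    float data[12] = {0};
    Eigen::Map<Eigen::Vector4f> v(data);      // fixed-size: pointer only
    Eigen::Map<Eigen::VectorXf> w(data, 12);  // dynamic vector: pointer + size
    Eigen::Map<Eigen::MatrixXf, Eigen::Unaligned, Eigen::OuterStride<> >
        m(data, 3, 2, Eigen::OuterStride<>(4));  // 3x2 view with outer stride 4
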
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index acac74aa4..0d85085c8 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -146,12 +146,12 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
- inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
- : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
+ inline MapBase(PointerType dataPtr, Index rows, Index cols)
+ : m_data(dataPtr), m_rows(rows), m_cols(cols)
{
eigen_assert( (dataPtr == 0)
- || ( nbRows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
- && nbCols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols)));
+ || ( rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+ && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
checkSanity();
}
@@ -160,6 +160,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
EIGEN_DEVICE_FUNC
void checkSanity() const
{
+ eigen_assert(EIGEN_IMPLIES(internal::packet_traits<Scalar>::AlignedOnScalar, (size_t(m_data) % sizeof(Scalar)) == 0)
+ && "input pointer is not aligned on scalar boundary, e.g., use \"EIGEN_ALIGN8 T ptr[N];\" for double or complex<float>");
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
}
@@ -234,7 +236,7 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
- EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
+ EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}
EIGEN_DEVICE_FUNC
Derived& operator=(const MapBase& other)
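
The extra assertion makes misuse fail early: a pointer that is not even aligned on the scalar size now triggers an assert at construction in debug builds instead of crashing later in vectorized code. A sketch of what it catches:

    EIGEN_ALIGN16 char buffer[64];
    double* bad = reinterpret_cast<double*>(buffer + 1);  // not 8-byte aligned
    // Eigen::Map<Eigen::Vector2d> v(bad);  // would assert: pointer not on a scalar boundary
    EIGEN_ALIGN16 double good[2] = {0, 0};  // suitably aligned storage
    Eigen::Map<Eigen::Vector2d> ok(good);   // fine
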
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 88ffd7d60..b4a68e08a 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -25,7 +25,7 @@ namespace Eigen {
*
* The first three template parameters are required:
* \tparam _Scalar \anchor matrix_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
- * User defined sclar types are supported as well (see \ref user_defined_scalars "here").
+ * User defined scalar types are supported as well (see \ref user_defined_scalars "here").
* \tparam _Rows Number of rows, or \b Dynamic
* \tparam _Cols Number of columns, or \b Dynamic
*
@@ -170,7 +170,7 @@ class Matrix
*/
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
+ EIGEN_STRONG_INLINE Matrix& operator=(const DenseBase<OtherDerived>& other)
{
return Base::_set(other);
}
@@ -266,8 +266,8 @@ class Matrix
*
* \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
* calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
- * For fixed-size \c 1x1 matrices it is thefore recommended to use the default
- * constructor Matrix() instead, especilly when using one of the non standard
+ * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
+ * constructor Matrix() instead, especially when using one of the non standard
* \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
*/
EIGEN_STRONG_INLINE explicit Matrix(Index dim);
@@ -281,8 +281,8 @@ class Matrix
*
* \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
* calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
- * For fixed-size \c 1x2 or \c 2x1 vectors it is thefore recommended to use the default
- * constructor Matrix() instead, especilly when using one of the non standard
+ * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
+ * constructor Matrix() instead, especially when using one of the non standard
* \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
*/
EIGEN_DEVICE_FUNC
@@ -315,37 +315,10 @@ class Matrix
}
- /** \brief Constructor copying the value of the expression \a other */
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
- : Base(other.rows() * other.cols(), other.rows(), other.cols())
- {
- // This test resides here, to bring the error messages closer to the user. Normally, these checks
- // are performed deeply within the library, thus causing long and scary error traces.
- EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
- Base::_check_template_params();
- Base::_set_noalias(other);
- }
/** \brief Copy constructor */
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Matrix(const Matrix& other)
- : Base(other.rows() * other.cols(), other.rows(), other.cols())
- {
- Base::_check_template_params();
- Base::_set_noalias(other);
- }
- /** \brief Copy constructor with in-place evaluation */
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
- {
- Base::_check_template_params();
- Base::resize(other.rows(), other.cols());
- other.evalTo(*this);
- }
+ EIGEN_STRONG_INLINE Matrix(const Matrix& other) : Base(other)
+ { }
/** \brief Copy constructor for generic expressions.
* \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
@@ -353,14 +326,8 @@ class Matrix
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
- : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
- {
- Base::_check_template_params();
- Base::_resize_to_match(other);
- // FIXME/CHECK: isn't *this = other.derived() more efficient. it allows to
- // go for pure _set() implementations, right?
- *this = other;
- }
+ : Base(other.derived())
+ { }
EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
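
Removing the templated MatrixBase constructor does not relax type checking: mixed scalar types are still rejected at compile time (the static assert now fires on the assignment path inside PlainObjectBase), so an explicit cast remains necessary:

    Eigen::Matrix3d d = Eigen::Matrix3d::Identity();
    // Eigen::Matrix3f f(d);              // still a compile error: mixed scalar types
    Eigen::Matrix3f f = d.cast<float>();  // explicit conversion is required
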
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index ed28b4d07..81b900eee 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -164,11 +164,9 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& other);
-#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename ProductDerived, typename Lhs, typename Rhs>
EIGEN_DEVICE_FUNC
Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
@@ -412,7 +410,8 @@ template<typename Derived> class MatrixBase
ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
// put this as separate enum value to work around possible GCC 4.3 bug (?)
- enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
+ enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
+ : ColsAtCompileTime==1 ? Vertical : Horizontal };
typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
HomogeneousReturnType homogeneous() const;
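
The refined enum only affects 1x1 expressions, for which the storage order now decides whether homogeneous() extends vertically or horizontally; for genuine vectors the behaviour is unchanged:

    Eigen::Vector3d p(1, 2, 3);
    Eigen::Vector4d ph = p.homogeneous();     // column vector: a row is appended
    Eigen::RowVector3d q(1, 2, 3);
    Eigen::RowVector4d qh = q.homogeneous();  // row vector: a column is appended
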
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index 065000c12..99f5aecdd 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -353,7 +353,7 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
* array's size.
*/
template<typename Other>
- explicit inline PermutationMatrix(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+ explicit inline PermutationMatrix(const MatrixBase<Other>& indices) : m_indices(indices)
{}
/** Convert the Transpositions \a tr to a permutation matrix */
@@ -527,8 +527,8 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
typedef typename Traits::IndicesType IndicesType;
#endif
- inline PermutationWrapper(const IndicesType& a_indices)
- : m_indices(a_indices)
+ inline PermutationWrapper(const IndicesType& indices)
+ : m_indices(indices)
{}
/** const version of indices(). */
@@ -596,7 +596,8 @@ struct permut_matrix_product_retval
const Index n = Side==OnTheLeft ? rows() : cols();
// FIXME we need an is_same for expression that is not sensitive to constness. For instance
// is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
- if(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix))
+ //if(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix))
+ if(is_same_dense(dst, m_matrix))
{
// apply the permutation inplace
Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
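
Using is_same_dense instead of comparing extract_data pointers makes the aliasing test valid for expressions without direct storage. The in-place branch it guards is taken in situations like this sketch:

    Eigen::PermutationMatrix<Eigen::Dynamic> P(3);
    P.setIdentity();
    P.applyTranspositionOnTheRight(0, 2);
    Eigen::MatrixXd M = Eigen::MatrixXd::Random(3, 3);
    M = P * M;  // destination aliases the operand: rows are permuted in place
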
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 65d69f484..9cb32e7d8 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -69,8 +69,9 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
#ifdef EIGEN_PARSED_BY_DOXYGEN
namespace internal {
-// this is a warkaround to doxygen not being able to understand the inheritence logic
+// this is a workaround to doxygen not being able to understand the inheritance logic
// when it is hidden by the dense_xpr_base helper struct.
+/** This class is just a workaround for Doxygen and it does not actually exist. */
template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
/** This class is just a workaround for Doxygen and it does not actually exist. */
template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
@@ -96,6 +97,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
typedef typename internal::traits<Derived>::StorageKind StorageKind;
typedef typename internal::traits<Derived>::Scalar Scalar;
+
typedef typename internal::packet_traits<Scalar>::type PacketScalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef Derived DenseType;
@@ -244,22 +246,22 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
*/
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
+ EIGEN_STRONG_INLINE void resize(Index rows, Index cols)
{
- eigen_assert( EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
- && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,nbCols==ColsAtCompileTime)
- && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,nbRows<=MaxRowsAtCompileTime)
- && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,nbCols<=MaxColsAtCompileTime)
- && nbRows>=0 && nbCols>=0 && "Invalid sizes when resizing a matrix or array.");
- internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
+ eigen_assert( EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,rows==RowsAtCompileTime)
+ && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,cols==ColsAtCompileTime)
+ && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,rows<=MaxRowsAtCompileTime)
+ && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,cols<=MaxColsAtCompileTime)
+ && rows>=0 && cols>=0 && "Invalid sizes when resizing a matrix or array.");
+ internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
#ifdef EIGEN_INITIALIZE_COEFFS
- Index size = nbRows*nbCols;
+ Index size = rows*cols;
bool size_changed = size != this->size();
- m_storage.resize(size, nbRows, nbCols);
+ m_storage.resize(size, rows, cols);
if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
#else
- internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
- m_storage.resize(nbRows*nbCols, nbRows, nbCols);
+ internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
+ m_storage.resize(rows*cols, rows, cols);
#endif
}
@@ -300,9 +302,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* \sa resize(Index,Index)
*/
EIGEN_DEVICE_FUNC
- inline void resize(NoChange_t, Index nbCols)
+ inline void resize(NoChange_t, Index cols)
{
- resize(rows(), nbCols);
+ resize(rows(), cols);
}
/** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
@@ -314,9 +316,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* \sa resize(Index,Index)
*/
EIGEN_DEVICE_FUNC
- inline void resize(Index nbRows, NoChange_t)
+ inline void resize(Index rows, NoChange_t)
{
- resize(nbRows, cols());
+ resize(rows, cols());
}
/** Resizes \c *this to have the same dimensions as \a other.
@@ -356,9 +358,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* appended to the matrix they will be uninitialized.
*/
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
+ EIGEN_STRONG_INLINE void conservativeResize(Index rows, Index cols)
{
- internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
+ internal::conservative_resize_like_impl<Derived>::run(*this, rows, cols);
}
/** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -369,10 +371,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* In case the matrix is growing, new rows will be uninitialized.
*/
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
+ EIGEN_STRONG_INLINE void conservativeResize(Index rows, NoChange_t)
{
// Note: see the comment in conservativeResize(Index,Index)
- conservativeResize(nbRows, cols());
+ conservativeResize(rows, cols());
}
/** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -383,10 +385,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* In case the matrix is growing, new columns will be uninitialized.
*/
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
+ EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index cols)
{
// Note: see the comment in conservativeResize(Index,Index)
- conservativeResize(rows(), nbCols);
+ conservativeResize(rows(), cols);
}
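
For reference, a minimal usage sketch contrasting the two resizing families documented above (standard Eigen API, not part of this patch):

#include <Eigen/Core>

int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Ones(2, 2);
  m.resize(3, 3);                           // reallocates; coefficients are left uninitialized
  m.conservativeResize(4, Eigen::NoChange); // keeps the old 3x3 block; the new row is uninitialized
}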
/** Resizes the vector to \a size while retaining old values.
@@ -479,9 +481,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
}
#endif
+ /** Copy constructor */
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
+ : Base(), m_storage(other.m_storage) { }
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
- : m_storage(a_size, nbRows, nbCols)
+ EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
+ : m_storage(size, rows, cols)
{
// _check_template_params();
// EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
@@ -498,15 +504,36 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
return this->derived();
}
- /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
+ /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
+ : m_storage()
+ {
+ _check_template_params();
+ resizeLike(other);
+ _set_noalias(other);
+ }
+
+ /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
- : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
+ : m_storage()
{
_check_template_params();
- internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.derived().rows(), other.derived().cols());
- Base::operator=(other.derived());
+ resizeLike(other);
+ *this = other.derived();
+ }
+ /** \brief Copy constructor with in-place evaluation */
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue<OtherDerived>& other)
+ {
+ _check_template_params();
+ // FIXME this does not automatically transpose vectors if necessary
+ resize(other.rows(), other.cols());
+ other.evalTo(this->derived());
}
/** \name Map
@@ -668,12 +695,12 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
template<typename T0, typename T1>
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
+ EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
{
EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
bool(NumTraits<T1>::IsInteger),
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
- resize(nbRows,nbCols);
+ resize(rows,cols);
}
template<typename T0, typename T1>
diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h
index fdd43ed0c..cf2a82877 100644
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@@ -162,8 +162,8 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
*
* \not_reentrant
*
- * \param nbRows the new number of rows
- * \param nbCols the new number of columns
+ * \param rows the new number of rows
+ * \param cols the new number of columns
*
* Example: \include Matrix_setRandom_int_int.cpp
* Output: \verbinclude Matrix_setRandom_int_int.out
@@ -172,9 +172,9 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setRandom(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setRandom(Index rows, Index cols)
{
- resize(nbRows, nbCols);
+ resize(rows, cols);
return setRandom();
}
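
A minimal usage sketch of the overload documented above (standard Eigen API):

#include <Eigen/Core>

int main() {
  Eigen::MatrixXd m;
  m.setRandom(3, 4); // equivalent to m.resize(3, 4) followed by m.setRandom()
}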
diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h
index 3777049ee..208f86380 100644
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@@ -72,8 +72,8 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
template<typename OriginalMatrixType>
- inline explicit Replicate(const OriginalMatrixType& a_matrix)
- : m_matrix(a_matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
+ inline explicit Replicate(const OriginalMatrixType& matrix)
+ : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
{
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
@@ -81,8 +81,8 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
}
template<typename OriginalMatrixType>
- inline Replicate(const OriginalMatrixType& a_matrix, Index rowFactor, Index colFactor)
- : m_matrix(a_matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
+ inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
+ : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
{
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
@@ -91,31 +91,6 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
- inline Scalar coeff(Index rowId, Index colId) const
- {
- // try to avoid using modulo; this is a pure optimization strategy
- const Index actual_row = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
- : RowFactor==1 ? rowId
- : rowId%m_matrix.rows();
- const Index actual_col = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
- : ColFactor==1 ? colId
- : colId%m_matrix.cols();
-
- return m_matrix.coeff(actual_row, actual_col);
- }
- template<int LoadMode>
- inline PacketScalar packet(Index rowId, Index colId) const
- {
- const Index actual_row = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
- : RowFactor==1 ? rowId
- : rowId%m_matrix.rows();
- const Index actual_col = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
- : ColFactor==1 ? colId
- : colId%m_matrix.cols();
-
- return m_matrix.template packet<LoadMode>(actual_row, actual_col);
- }
-
const _MatrixTypeNested& nestedExpression() const
{
return m_matrix;
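
The removed coeff()/packet() members implemented the wrap-around index mapping; that mapping still describes the expression's behavior (it now lives in the evaluator). A sketch of the observable result, assuming standard Eigen API:

#include <Eigen/Core>
#include <cassert>

int main() {
  Eigen::MatrixXd m(2, 2);
  m << 1, 2, 3, 4;
  Eigen::MatrixXd r = m.replicate(2, 3); // 4x6 result
  assert(r(3, 5) == m(3 % 2, 5 % 2));    // indices wrap modulo the original size
}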
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 9bac726f7..ded42e0e8 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -198,8 +198,8 @@ void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<Ot
* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
* is an upper (resp. lower) triangular matrix.
*
- * Example: \include MatrixBase_marked.cpp
- * Output: \verbinclude MatrixBase_marked.out
+ * Example: \include Triangular_solve.cpp
+ * Output: \verbinclude Triangular_solve.out
*
 * This function returns an expression of the inverse-multiply and can work in-place if it is assigned
* to the same matrix or vector \a other.
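
A minimal usage sketch of the solve described above (standard Eigen API; the diagonal shift only keeps the diagonal non-zero, as required):

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3d A = Eigen::Matrix3d::Random();
  A.diagonal().array() += 4.0; // ensure a non-zero diagonal
  Eigen::Vector3d b = Eigen::Vector3d::Random();
  Eigen::Vector3d x = A.triangularView<Eigen::Lower>().solve(b); // forward substitution
  b = A.triangularView<Eigen::Lower>().solve(b);                 // the in-place variant
}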
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 2c967abca..7221ee03e 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -60,7 +60,7 @@ template<typename MatrixType> class Transpose
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
EIGEN_DEVICE_FUNC
- explicit inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
+ explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index e9b54c2ba..b08df1ead 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -178,7 +178,7 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
/** Generic constructor from expression of the transposition indices. */
template<typename Other>
- explicit inline Transpositions(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+ explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
{}
/** Copies the \a other transpositions into \c *this */
@@ -292,8 +292,8 @@ class TranspositionsWrapper
typedef typename Traits::IndicesType IndicesType;
typedef typename IndicesType::Scalar StorageIndex;
- explicit inline TranspositionsWrapper(IndicesType& a_indices)
- : m_indices(a_indices)
+ explicit inline TranspositionsWrapper(IndicesType& indices)
+ : m_indices(indices)
{}
/** Copies the \a other transpositions into \c *this */
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index fd53ae4cb..0a6397509 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -19,9 +19,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
}
-/** \internal
- *
- * \class TriangularBase
+/** \class TriangularBase
* \ingroup Core_Module
*
* \brief Base class for triangular part in a matrix
@@ -63,11 +61,11 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
inline Index innerStride() const { return derived().innerStride(); }
// dummy resize function
- void resize(Index nbRows, Index nbCols)
+ void resize(Index rows, Index cols)
{
- EIGEN_UNUSED_VARIABLE(nbRows);
- EIGEN_UNUSED_VARIABLE(nbCols);
- eigen_assert(nbRows==rows() && nbCols==nbCols);
+ EIGEN_UNUSED_VARIABLE(rows);
+ EIGEN_UNUSED_VARIABLE(cols);
+ eigen_assert(rows==this->rows() && cols==this->cols());
}
EIGEN_DEVICE_FUNC
@@ -148,17 +146,17 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
/** \class TriangularView
* \ingroup Core_Module
*
- * \brief Base class for triangular part in a matrix
+ * \brief Expression of a triangular part in a matrix
*
* \param MatrixType the type of the object in which we are taking the triangular part
* \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
* #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
* This is in fact a bit field; it must have either #Upper or #Lower,
- * and additionnaly it may have #UnitDiag or #ZeroDiag or neither.
+ * and additionally it may have #UnitDiag or #ZeroDiag or neither.
*
* This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
* matrices one should speak of "trapezoid" parts. This class is the return type
- * of MatrixBase::triangularView() and most of the time this is the only way it is used.
+ * of MatrixBase::triangularView() and SparseMatrixBase::triangularView(), and most of the time this is the only way it is used.
*
* \sa MatrixBase::triangularView()
*/
@@ -306,6 +304,15 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
MatrixTypeNested m_matrix;
};
+/** \ingroup Core_Module
+ *
+ * \brief Base class for a triangular part in a \b dense matrix
+ *
+ * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+ * It extends class TriangularView with additional methods that are available for dense expressions only.
+ *
+ * \sa class TriangularView, MatrixBase::triangularView()
+ */
template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_MatrixType,_Mode,Dense>
: public TriangularBase<TriangularView<_MatrixType, _Mode> >
{
@@ -549,8 +556,8 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
* The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
* \c #Lower, \c #StrictlyLower, \c #UnitLower.
*
- * Example: \include MatrixBase_extract.cpp
- * Output: \verbinclude MatrixBase_extract.out
+ * Example: \include MatrixBase_triangularView.cpp
+ * Output: \verbinclude MatrixBase_triangularView.out
*
* \sa class TriangularView
*/
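
A minimal usage sketch of the view described above (standard Eigen API):

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3d M = Eigen::Matrix3d::Random();
  Eigen::Matrix3d U = M.triangularView<Eigen::Upper>(); // dense copy: upper part kept, rest zeroed
  M.triangularView<Eigen::StrictlyLower>().setZero();   // writes only through the selected part
}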
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 2810a7a0b..06cd56684 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -271,6 +271,86 @@ pexp<Packet8f>(const Packet8f& _x) {
return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+pexp<Packet4d>(const Packet4d& _x) {
+ Packet4d x = _x;
+
+ _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
+ _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
+ _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
+
+ _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
+ _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
+
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
+
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
+
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
+ _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+
+ Packet4d tmp, fx;
+
+ // clamp x
+ x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
+ // Express exp(x) as exp(g + n*log(2)).
+ fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
+
+ // Get the integer modulus of log(2), i.e. the "n" described above.
+ fx = _mm256_floor_pd(fx);
+
+ // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+ // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+ // digits right.
+ tmp = pmul(fx, p4d_cephes_exp_C1);
+ Packet4d z = pmul(fx, p4d_cephes_exp_C2);
+ x = psub(x, tmp);
+ x = psub(x, z);
+
+ Packet4d x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial of the rational interpolant.
+ Packet4d px = p4d_cephes_exp_p0;
+ px = pmadd(px, x2, p4d_cephes_exp_p1);
+ px = pmadd(px, x2, p4d_cephes_exp_p2);
+ px = pmul(px, x);
+
+ // Evaluate the denominator polynomial of the rational interpolant.
+ Packet4d qx = p4d_cephes_exp_q0;
+ qx = pmadd(qx, x2, p4d_cephes_exp_q1);
+ qx = pmadd(qx, x2, p4d_cephes_exp_q2);
+ qx = pmadd(qx, x2, p4d_cephes_exp_q3);
+
+ // I don't really get this bit, copied from the SSE2 routines, so...
+ // TODO(gonnet): Figure out what is going on here, perhaps find a better
+ // rational interpolant?
+ x = _mm256_div_pd(px, psub(qx, px));
+ x = pmadd(p4d_2, x, p4d_1);
+
+ // Build e=2^n by constructing the exponents in a 128-bit vector and
+ // shifting them to where they belong in double-precision values.
+ __m128i emm0 = _mm256_cvtpd_epi32(fx);
+ emm0 = _mm_add_epi32(emm0, p4i_1023);
+ emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i lo = _mm_slli_epi64(emm0, 52);
+ __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+ __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+ e = _mm256_insertf128_si256(e, hi, 1);
+
+ // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+ // non-finite values in the input.
+ return pmax(pmul(x, Packet4d(e)), _x);
+}
+
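
For illustration, the same range reduction in scalar form (plain C++ sketch, not part of the patch; the constants are copied from the code above, and std::exp stands in for the rational approximation of exp(g)):

#include <cmath>
#include <cstdint>
#include <cstring>

double exp_sketch(double x) {
  const double LOG2E = 1.4426950408889634;  // cephes_LOG2EF above
  double n = std::floor(x * LOG2E + 0.5);   // the "fx" above: exp(x) = 2^n * exp(g)
  double g = x - n * 0.693145751953125;     // subtract n*log(2) in two steps,
  g -= n * 1.42860682030941723212e-6;       // C1 + C2 == log(2), to keep the low digits right
  std::int64_t bits = (static_cast<std::int64_t>(n) + 1023) << 52; // build 2^n from its exponent field
  double two_n;
  std::memcpy(&two_n, &bits, sizeof two_n);
  return two_n * std::exp(g);
}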
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
@@ -300,15 +380,59 @@ psqrt<Packet8f>(const Packet8f& _x) {
return pmul(_x, x);
}
#else
-template <>
-EIGEN_STRONG_INLINE Packet8f psqrt<Packet8f>(const Packet8f& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& x) {
return _mm256_sqrt_ps(x);
}
#endif
-template <>
-EIGEN_STRONG_INLINE Packet4d psqrt<Packet4d>(const Packet4d& x) {
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d psqrt<Packet4d>(const Packet4d& x) {
return _mm256_sqrt_pd(x);
}
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+ _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+ _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
+ _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
+ _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
+ _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
+
+ Packet8f neg_half = pmul(_x, p8f_minus_half);
+
+ // select only the inverse sqrt of positive normal inputs (denormals are
+ // flushed to zero and cause infs as well).
+ Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+ Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
+
+ // Fill in NaNs and Infs for the negative/zero entries.
+ Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
+ Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
+ Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
+ _mm256_and_ps(zero_mask, p8f_inf));
+
+ // Do a single step of Newton's iteration.
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
+
+ // Insert NaNs and Infs in all the right places.
+ return _mm256_or_ps(x, infs_and_nans);
+}
+
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+ _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+ return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x));
+}
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+ _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
+ return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x));
+}
+
} // end namespace internal
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 695185a49..a2306fd1a 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -65,6 +65,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
+ HasRsqrt = 1,
HasBlend = 1
};
};
@@ -79,8 +80,9 @@ template<> struct packet_traits<double> : default_packet_traits
HasHalfPacket = 1,
HasDiv = 1,
- HasExp = 0,
+ HasExp = 1,
HasSqrt = 1,
+ HasRsqrt = 1,
HasBlend = 1
};
};
@@ -432,26 +434,30 @@ struct palign_impl<Offset,Packet8f>
if (Offset==1)
{
first = _mm256_blend_ps(first, second, 1);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
- first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
+ Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+ Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+ first = _mm256_blend_ps(tmp1, tmp2, 0x88);
}
else if (Offset==2)
{
first = _mm256_blend_ps(first, second, 3);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
- first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
+ Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+ Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+ first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
}
else if (Offset==3)
{
first = _mm256_blend_ps(first, second, 7);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
- first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
+ Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+ Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+ first = _mm256_blend_ps(tmp1, tmp2, 0xee);
}
else if (Offset==4)
{
first = _mm256_blend_ps(first, second, 15);
- Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
- first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
+ Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
+ Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+ first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
}
else if (Offset==5)
{
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
new file mode 100644
index 000000000..83bfdc604
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// For now we use SSE to handle integers, so we can't use AVX instructions to cast
+// from int to float
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 0,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 0,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+
+
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+ return _mm256_cvtps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+ return _mm256_cvtepi32_ps(a);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX_H
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 565d2ece0..1f00e27d8 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -408,7 +408,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
// TODO optimize it for AltiVec
Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
- return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
+ return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64))));
}
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h
deleted file mode 100644
index 5007c155d..000000000
--- a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h
+++ /dev/null
@@ -1,110 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
-#define EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
-
-namespace Eigen {
-namespace internal {
-
-/* The following lookup table was generated from measurements on a Nexus 5,
- * which has a Qualcomm Krait 400 CPU. This is very representative of current
- * 32bit (ARMv7) Android devices. On the other hand, I don't know how
- * representative that is outside of these conditions. Accordingly,
- * let's only use this lookup table on ARM 32bit on Android for now.
- *
- * Measurements were single-threaded, with Scalar=float, compiled with
- * -mfpu=neon-vfpv4, so the pmadd instruction used was VFMA.F32.
- *
- * The device was cooled, allowing it to run a the max clock speed throughout.
- * This may not be representative of real-world thermal conditions.
- *
- * The benchmark attempted to flush caches to test cold-cache performance.
- */
-#if EIGEN_ARCH_ARM && EIGEN_OS_ANDROID
-template<>
-struct BlockingSizesLookupTable<float, float> {
- static const size_t BaseSize = 16;
- static const size_t NumSizes = 8;
- static const unsigned short* Data() {
- static const unsigned short data[512] = {
- 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447,
- 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456,
- 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467,
- 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x476, 0x478,
- 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x476, 0x476,
- 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x488,
- 0x474, 0x475, 0x476, 0x4a6, 0x496, 0x496, 0x495, 0x4a6,
- 0x474, 0x475, 0x466, 0x4a6, 0x497, 0x4a5, 0x496, 0x4a5,
- 0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b,
- 0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b,
- 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b,
- 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576,
- 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587,
- 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x596, 0x597,
- 0x574, 0x565, 0x566, 0x596, 0x596, 0x5a6, 0x5a6, 0x5a6,
- 0x564, 0x565, 0x5a6, 0x596, 0x5a6, 0x5a6, 0x5a6, 0x5a6,
- 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b,
- 0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b,
- 0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667,
- 0x654, 0x665, 0x676, 0x677, 0x678, 0x679, 0x67a, 0x675,
- 0x684, 0x675, 0x686, 0x687, 0x688, 0x688, 0x687, 0x686,
- 0x664, 0x685, 0x666, 0x677, 0x697, 0x696, 0x697, 0x697,
- 0x664, 0x665, 0x696, 0x696, 0x685, 0x6a6, 0x696, 0x696,
- 0x664, 0x675, 0x686, 0x696, 0x6a6, 0x696, 0x696, 0x696,
- 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x747,
- 0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757,
- 0x764, 0x765, 0x756, 0x767, 0x768, 0x759, 0x75a, 0x766,
- 0x744, 0x755, 0x766, 0x777, 0x768, 0x759, 0x778, 0x777,
- 0x744, 0x745, 0x766, 0x777, 0x788, 0x786, 0x786, 0x788,
- 0x754, 0x755, 0x766, 0x787, 0x796, 0x796, 0x787, 0x796,
- 0x684, 0x695, 0x696, 0x6a6, 0x795, 0x786, 0x795, 0x796,
- 0x684, 0x695, 0x696, 0x795, 0x786, 0x796, 0x795, 0x796,
- 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x848, 0x848,
- 0x844, 0x855, 0x846, 0x847, 0x848, 0x849, 0x855, 0x857,
- 0x844, 0x845, 0x846, 0x857, 0x848, 0x859, 0x866, 0x865,
- 0x844, 0x855, 0x846, 0x847, 0x878, 0x859, 0x877, 0x877,
- 0x844, 0x855, 0x846, 0x867, 0x886, 0x887, 0x885, 0x886,
- 0x784, 0x785, 0x786, 0x877, 0x897, 0x885, 0x896, 0x896,
- 0x684, 0x695, 0x686, 0x886, 0x885, 0x885, 0x886, 0x896,
- 0x694, 0x6a5, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896,
- 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x847, 0x848,
- 0x954, 0x855, 0x856, 0x947, 0x858, 0x857, 0x858, 0x858,
- 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x867, 0x867,
- 0x944, 0x975, 0x976, 0x877, 0x877, 0x877, 0x877, 0x877,
- 0x784, 0x785, 0x886, 0x887, 0x886, 0x887, 0x887, 0x887,
- 0x784, 0x785, 0x786, 0x796, 0x887, 0x897, 0x896, 0x896,
- 0x684, 0x695, 0x6a6, 0x886, 0x886, 0x896, 0x896, 0x896,
- 0x6a4, 0x6a5, 0x696, 0x896, 0x886, 0x896, 0x896, 0x896,
- 0xa44, 0xa45, 0xa46, 0xa47, 0x847, 0x848, 0x847, 0x848,
- 0xa44, 0xa45, 0x856, 0x857, 0x857, 0x857, 0x857, 0x857,
- 0xa44, 0xa65, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867,
- 0x774, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
- 0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887,
- 0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897,
- 0x684, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896,
- 0x684, 0x6a5, 0x6a5, 0x886, 0x886, 0x896, 0x896, 0x896,
- 0xb44, 0x845, 0x846, 0x847, 0x847, 0x945, 0x846, 0x946,
- 0xb54, 0x855, 0x856, 0x857, 0x857, 0x856, 0x857, 0x856,
- 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x866, 0x867,
- 0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
- 0x784, 0x885, 0x886, 0x787, 0x887, 0x887, 0x887, 0x887,
- 0x784, 0x785, 0x786, 0x796, 0x886, 0x897, 0x897, 0x897,
- 0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896,
- 0x684, 0x685, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896
- };
- return data;
- }
-};
-#endif
-
-}
-}
-
-#endif // EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index c7fb12fe8..60ab56a47 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -114,7 +114,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
- Packet4f res;
+ Packet4f res = pset1<Packet4f>(0.f);
res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
@@ -365,7 +365,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
- Packet2d res;
+ Packet2d res = pset1<Packet2d>(0.0);
res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
return Packet1cd(res);
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index ce0abfd80..cda9056a4 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -252,7 +252,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
- Packet4f res;
+ Packet4f res = pset1<Packet4f>(0.f);
res = vsetq_lane_f32(from[0*stride], res, 0);
res = vsetq_lane_f32(from[1*stride], res, 1);
res = vsetq_lane_f32(from[2*stride], res, 2);
@@ -261,7 +261,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
- Packet4i res;
+ Packet4i res = pset1<Packet4i>(0);
res = vsetq_lane_s32(from[0*stride], res, 0);
res = vsetq_lane_s32(from[1*stride], res, 1);
res = vsetq_lane_s32(from[2*stride], res, 2);
@@ -637,7 +637,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
- Packet2d res;
+ Packet2d res = pset1<Packet2d>(0.0);
res = vsetq_lane_f64(from[0*stride], res, 0);
res = vsetq_lane_f64(from[1*stride], res, 1);
return res;
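
In scalar form, the gather these kernels implement is just a strided copy; the pset1 initialization added above avoids building the packet lane-by-lane on top of an uninitialized vector. A sketch:

void gather4(const float* from, long stride, float out[4]) {
  for (int i = 0; i < 4; ++i)
    out[i] = from[i * stride]; // one lane per stride step, as in the vsetq_lane calls
}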
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index f86c0a39a..3b8b7303f 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -462,11 +462,59 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
#else
-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
#endif
-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+ _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+ _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+
+ Packet4f neg_half = pmul(_x, p4f_minus_half);
+
+ // select only the inverse sqrt of positive normal inputs (denormals are
+ // flushed to zero and cause infs as well).
+ Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
+ Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
+
+ // Fill in NaNs and Infs for the negative/zero entries.
+ Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
+ Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
+ Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
+ _mm_and_ps(zero_mask, p4f_inf));
+
+ // Do a single step of Newton's iteration.
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
+
+ // Insert NaNs and Infs in all the right places.
+ return _mm_or_ps(x, infs_and_nans);
+}
+
+#else
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  // Unfortunately we can't use the much faster _mm_rsqrt_ps since it only provides an approximation.
+ return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
+}
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  // There is no double-precision counterpart of _mm_rsqrt_ps, so fall back to a full sqrt and a division.
+ return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+}
} // end namespace internal
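
The fast-math branch above refines a hardware reciprocal-sqrt estimate with one Newton step; in scalar form (sketch, not part of the patch):

float rsqrt_newton_step(float a, float y) {
  // y approximates 1/sqrt(a), e.g. from _mm_rsqrt_ps; one Newton step
  // y <- y * (1.5 - 0.5 * a * y * y) matches pmul(x, pmadd(neg_half, x*x, 1.5)) above
  return y * (1.5f + (-0.5f * a) * (y * y));
}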
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 3653783fd..38a84273d 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -108,6 +108,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
+ HasRsqrt = 1,
HasBlend = 1
};
};
@@ -124,6 +125,7 @@ template<> struct packet_traits<double> : default_packet_traits
HasDiv = 1,
HasExp = 1,
HasSqrt = 1,
+ HasRsqrt = 1,
HasBlend = 1
};
};
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
new file mode 100644
index 000000000..c84893230
--- /dev/null
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SSE_H
+#define EIGEN_TYPE_CASTING_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return _mm_cvttps_epi32(a);
+}
+
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return _mm_cvtepi32_ps(a);
+}
+
+
+template <>
+struct type_casting_traits<double, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 2,
+ TgtCoeffRatio = 1
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+ return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
+
+template <>
+struct type_casting_traits<float, double> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 2
+ };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+ // Simply discard the second half of the input
+ return _mm_cvtps_pd(a);
+}
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SSE_H
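
The 2:1 SrcCoeffRatio case is the interesting one: two 2-lane double packets produce one 4-lane float packet. A standalone sketch of the same intrinsic sequence:

#include <emmintrin.h>

__m128 cast_two_pd_to_ps(__m128d a, __m128d b) {
  // _mm_cvtpd_ps leaves the converted pair in the low lanes (upper lanes zero);
  // the shuffle mask 0b01000100 selects {a0, a1, b0, b1}, as in pcast above.
  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}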
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 9c96181c7..85e605889 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -155,6 +155,48 @@ struct functor_traits<scalar_max_op<Scalar> > {
};
/** \internal
+ * \brief Template functors for comparison of two scalars
+ * \todo Implement packet-comparisons
+ */
+template<typename Scalar, ComparisonName cmp> struct scalar_cmp_op;
+
+template<typename Scalar, ComparisonName cmp>
+struct functor_traits<scalar_cmp_op<Scalar, cmp> > {
+ enum {
+ Cost = NumTraits<Scalar>::AddCost,
+ PacketAccess = false
+ };
+};
+
+template<ComparisonName Cmp, typename Scalar>
+struct result_of<scalar_cmp_op<Scalar, Cmp>(Scalar,Scalar)> {
+ typedef bool type;
+};
+
+
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_EQ> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LT> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LE> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_UNORD> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;}
+};
+
+
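
Once this patch is applied, the functors can be invoked directly; a hypothetical helper for illustration (Eigen itself routes them through CwiseBinaryOp):

#include <Eigen/Core>

template<typename Scalar, Eigen::internal::ComparisonName Cmp>
bool cmp(const Scalar& a, const Scalar& b) {
  return Eigen::internal::scalar_cmp_op<Scalar, Cmp>()(a, b);
}
// cmp<double, Eigen::internal::cmp_LE>(1.0, 2.0)   -> true
// cmp<double, Eigen::internal::cmp_UNORD>(nan, 1.) -> true: neither a<=b nor b<=a holds for NaN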
+/** \internal
* \brief Template functor to compute the hypot of two scalars
*
* \sa MatrixBase::stableNorm(), class Redux
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index c92692e53..a6fa5ee31 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -289,6 +289,25 @@ struct functor_traits<scalar_sqrt_op<Scalar> >
};
/** \internal
+ * \brief Template functor to compute the reciprocal square root of a scalar
+ * \sa class CwiseUnaryOp, Cwise::rsqrt()
+ */
+template<typename Scalar> struct scalar_rsqrt_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return Scalar(1)/sqrt(a); }
+ typedef typename packet_traits<Scalar>::type Packet;
+ inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_rsqrt_op<Scalar> >
+{ enum {
+ Cost = 5 * NumTraits<Scalar>::MulCost,
+ PacketAccess = packet_traits<Scalar>::HasRsqrt
+ };
+};
+
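
Assuming the matching array plugin is wired up (see the ArrayCwiseUnaryOps.h changes in this commit), usage would look like:

#include <Eigen/Core>

int main() {
  Eigen::ArrayXf a = Eigen::ArrayXf::LinSpaced(4, 1.f, 4.f);
  Eigen::ArrayXf r = a.rsqrt(); // coefficient-wise 1/sqrt(x); vectorized when HasRsqrt is set
}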
+/** \internal
* \brief Template functor to compute the cosine of a scalar
* \sa class CwiseUnaryOp, ArrayBase::cos()
*/
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 24623963b..1d62ccd93 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -112,14 +112,18 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
nr = Traits::nr,
nr_mask = (0xffffffff/nr)*nr
};
- Index k_cache = (l1-ksub)/kdiv;
+ // Increasing k gives us more time to prefetch the content of the "C"
+ // registers. However once the latency is hidden there is no point in
+ // increasing the value of k, so we'll cap it at 320 (value determined
+ // experimentally).
+ const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
if (k_cache < k) {
k = k_cache & k_mask;
eigen_internal_assert(k > 0);
}
- Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
- Index n_per_thread = numext::div_ceil(n, num_threads);
+ const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+ const Index n_per_thread = numext::div_ceil(n, num_threads);
if (n_cache <= n_per_thread) {
// Don't exceed the capacity of the l2 cache.
eigen_internal_assert(n_cache >= static_cast<Index>(nr));
@@ -131,8 +135,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
if (l3 > l2) {
// l3 is shared between all cores, so we'll give each thread its own chunk of l3.
- Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
- Index m_per_thread = numext::div_ceil(m, num_threads);
+ const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+ const Index m_per_thread = numext::div_ceil(m, num_threads);
if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
m = m_cache & mr_mask;
eigen_internal_assert(m > 0);
@@ -287,7 +291,6 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
*
* The blocking size parameters may be evaluated:
* - either by a heuristic based on cache sizes;
- * - or using a precomputed lookup table;
* - or using fixed prescribed values (for testing purposes).
*
* \sa setCpuCacheSizes */
@@ -296,9 +299,7 @@ template<typename LhsScalar, typename RhsScalar, int KcFactor>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
if (!useSpecificBlockingSizes(k, m, n)) {
- if (!lookupBlockingSizesFromTable<LhsScalar, RhsScalar>(k, m, n, num_threads)) {
- evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
- }
+ evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
}
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
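
A simplified sketch of the heuristic's arithmetic (placeholder constants stand in for the Traits values; not part of the patch):

#include <algorithm>
#include <cstddef>

void blocking_sketch(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3,
                     std::ptrdiff_t num_threads,
                     std::ptrdiff_t& k, std::ptrdiff_t& n, std::ptrdiff_t& m) {
  const std::ptrdiff_t ksub = 0, kdiv = 8, nr = 4;       // placeholders for Traits constants
  const std::ptrdiff_t scalar_size = sizeof(float);
  k = std::min<std::ptrdiff_t>((l1 - ksub) / kdiv, 320); // cap k once prefetch latency is hidden
  n = (l2 - l1) / (nr * scalar_size * k);                // RHS panel must fit in L2
  if (l3 > l2)
    m = (l3 - l2) / (scalar_size * k * num_threads);     // each thread gets its own chunk of L3
}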
diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h
deleted file mode 100644
index 39a53c8f1..000000000
--- a/Eigen/src/Core/products/LookupBlockingSizesTable.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
-#define EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
-
-namespace Eigen {
-
-namespace internal {
-
-template <typename LhsScalar,
- typename RhsScalar,
- bool HasLookupTable = BlockingSizesLookupTable<LhsScalar, RhsScalar>::NumSizes != 0 >
-struct LookupBlockingSizesFromTableImpl
-{
- static bool run(Index&, Index&, Index&, Index)
- {
- return false;
- }
-};
-
-inline size_t floor_log2_helper(unsigned short& x, size_t offset)
-{
- unsigned short y = x >> offset;
- if (y) {
- x = y;
- return offset;
- } else {
- return 0;
- }
-}
-
-inline size_t floor_log2(unsigned short x)
-{
- return floor_log2_helper(x, 8)
- + floor_log2_helper(x, 4)
- + floor_log2_helper(x, 2)
- + floor_log2_helper(x, 1);
-}
-
-inline size_t ceil_log2(unsigned short x)
-{
- return x > 1 ? floor_log2(x - 1) + 1 : 0;
-}
-
-template <typename LhsScalar,
- typename RhsScalar>
-struct LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar, true>
-{
- static bool run(Index& k, Index& m, Index& n, Index)
- {
- using std::min;
- using std::max;
- typedef BlockingSizesLookupTable<LhsScalar, RhsScalar> Table;
- const unsigned short minsize = Table::BaseSize;
- const unsigned short maxsize = minsize << (Table::NumSizes - 1);
- const unsigned short k_clamped = max<unsigned short>(minsize, min<Index>(k, maxsize));
- const unsigned short m_clamped = max<unsigned short>(minsize, min<Index>(m, maxsize));
- const unsigned short n_clamped = max<unsigned short>(minsize, min<Index>(n, maxsize));
- const size_t k_index = ceil_log2(k_clamped / minsize);
- const size_t m_index = ceil_log2(m_clamped / minsize);
- const size_t n_index = ceil_log2(n_clamped / minsize);
- const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index);
- const unsigned short table_entry = Table::Data()[index];
- k = min<Index>(k, 1 << ((table_entry & 0xf00) >> 8));
- m = min<Index>(m, 1 << ((table_entry & 0x0f0) >> 4));
- n = min<Index>(n, 1 << ((table_entry & 0x00f) >> 0));
- return true;
- }
-};
-
-template <typename LhsScalar,
- typename RhsScalar>
-bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads)
-{
- if (num_threads > 1) {
- // We don't currently have lookup tables recorded for multithread performance,
- // and we have confirmed experimentally that our single-thread-recorded LUTs are
- // poor for multithread performance, and our LUTs don't currently contain
- // any annotation about multithread status (FIXME - we need that).
- // So for now, we just early-return here.
- return false;
- }
- return LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar>::run(k, m, n, num_threads);
-}
-
-}
-
-}
-
-#endif // EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index f5de67c59..a9a198d64 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -117,8 +117,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
{
// TODO write a small kernel handling this (can be shared with trsv)
Index i = IsLower ? k2+k1+k : k2-k1-k-1;
- Index s = IsLower ? k2+k1 : i+1;
Index rs = actualPanelWidth - k - 1; // remaining size
+ Index s = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1)
+ : IsLower ? i+1 : i-rs;
Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
for (Index j=j2; j<j2+actual_cols; ++j)
@@ -135,7 +136,6 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
}
else
{
- Index s = IsLower ? i+1 : i-rs;
Scalar b = (other(i,j) *= a);
Scalar* r = &other(s,j);
const Scalar* l = &tri(s,i);
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index d1855b50b..419409608 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -492,6 +492,16 @@ struct IndexBased {};
// evaluator based on iterators to access coefficients.
struct IteratorBased {};
+/** \internal
+ * Constants for comparison functors
+ */
+enum ComparisonName {
+ cmp_EQ = 0,
+ cmp_LT = 1,
+ cmp_LE = 2,
+ cmp_UNORD = 3,
+ cmp_NEQ = 4
+};
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 8034f9b5e..0d24beb5a 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -189,6 +189,7 @@ template<typename Scalar> struct scalar_imag_op;
template<typename Scalar> struct scalar_abs_op;
template<typename Scalar> struct scalar_abs2_op;
template<typename Scalar> struct scalar_sqrt_op;
+template<typename Scalar> struct scalar_rsqrt_op;
template<typename Scalar> struct scalar_exp_op;
template<typename Scalar> struct scalar_log_op;
template<typename Scalar> struct scalar_cos_op;
@@ -287,14 +288,6 @@ struct stem_function
typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
typedef ComplexScalar type(ComplexScalar, int);
};
-
-template <typename LhsScalar,
- typename RhsScalar>
-struct BlockingSizesLookupTable
-{
- static const size_t NumSizes = 0;
-};
-
}
} // end namespace Eigen
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 7c7a3b8e7..4eeb8211c 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -101,7 +101,7 @@
/// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
-#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_CLANG || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM )
+#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM )
#define EIGEN_COMP_GNUC_STRICT 1
#else
#define EIGEN_COMP_GNUC_STRICT 0
@@ -283,6 +283,19 @@
#define EIGEN_OS_WIN_STRICT 0
#endif
+/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
+#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
+ #define EIGEN_OS_SUN 1
+#else
+ #define EIGEN_OS_SUN 0
+#endif
+
+/// \internal EIGEN_OS_SOLARIS set to 1 if the OS is Solaris
+#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))
+ #define EIGEN_OS_SOLARIS 1
+#else
+ #define EIGEN_OS_SOLARIS 0
+#endif
@@ -589,6 +602,7 @@ namespace Eigen {
#error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
#endif
+#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES)
@@ -630,7 +644,7 @@ namespace Eigen {
// just an empty macro !
#define EIGEN_EMPTY
-#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1800 // for older MSVC versions using the base operator is sufficient (cf Bug 1000)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -649,6 +663,11 @@ namespace Eigen {
}
#endif
+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ */
#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
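
A minimal illustration of the name-hiding problem the macro works around (not Eigen code):

struct B {
  B& operator=(int) { return *this; }
};
struct D : B {
  using B::operator=;               // what the macro restores
  D& operator=(const D&) = default; // declaring any operator= here hides all of B's overloads
};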
/**
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 16f8cc1b0..62f329984 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -59,18 +59,20 @@
#endif
-// See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
-// It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
-// Currently, let's include it only on unix systems:
-#if EIGEN_OS_UNIX
- #include <unistd.h>
- #if (EIGEN_OS_QNX || (defined _GNU_SOURCE) || EIGEN_COMP_PGI || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
- #define EIGEN_HAS_POSIX_MEMALIGN 1
+#ifndef EIGEN_HAS_POSIX_MEMALIGN
+ // See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
+ // It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
+ // Currently, let's include it only on unix systems:
+ #if EIGEN_OS_UNIX && !(EIGEN_OS_SUN || EIGEN_OS_SOLARIS)
+ #include <unistd.h>
+ #if (EIGEN_OS_QNX || (defined _GNU_SOURCE) || EIGEN_COMP_PGI || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
+ #define EIGEN_HAS_POSIX_MEMALIGN 1
+ #endif
#endif
-#endif
-#ifndef EIGEN_HAS_POSIX_MEMALIGN
- #define EIGEN_HAS_POSIX_MEMALIGN 0
+ #ifndef EIGEN_HAS_POSIX_MEMALIGN
+ #define EIGEN_HAS_POSIX_MEMALIGN 0
+ #endif
#endif
#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 562f425bd..55132c8cf 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -606,6 +606,18 @@ template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
template<typename S1, typename S2> struct glue_shapes;
template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type; };
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+{
+ return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
+}
+
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+{
+ return false;
+}
+
} // end namespace internal
// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
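
A usage sketch of the helper introduced above:

#include <Eigen/Core>

int main() {
  Eigen::MatrixXd A(3, 3), B(3, 3);
  bool same = Eigen::internal::is_same_dense(A, A); // true: same data pointer and strides
  bool diff = Eigen::internal::is_same_dense(A, B); // false: distinct storage
  (void)same; (void)diff;
}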
diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h
index 677c7c0bb..02ebb7d17 100644
--- a/Eigen/src/Eigenvalues/RealQZ.h
+++ b/Eigen/src/Eigenvalues/RealQZ.h
@@ -315,8 +315,8 @@ namespace Eigen {
const Index dim=m_S.cols();
if (abs(m_S.coeff(i+1,i))==Scalar(0))
return;
- Index z = findSmallDiagEntry(i,i+1);
- if (z==i-1)
+ Index j = findSmallDiagEntry(i,i+1);
+ if (j==i-1)
{
// block of (S T^{-1})
Matrix2s STi = m_T.template block<2,2>(i,i).template triangularView<Upper>().
@@ -352,7 +352,7 @@ namespace Eigen {
}
else
{
- pushDownZero(z,i,i+1);
+ pushDownZero(j,i,i+1);
}
}
diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 1dcfacf0b..27a014a96 100644
--- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -198,17 +198,21 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
EIGEN_DEVICE_FUNC
SelfAdjointEigenSolver& compute(const MatrixType& matrix, int options = ComputeEigenvectors);
- /** \brief Computes eigendecomposition of given matrix using a direct algorithm
+ /** \brief Computes eigendecomposition of given matrix using a closed-form algorithm
*
* This is a variant of compute(const MatrixType&, int options) which
* directly solves the underlying polynomial equation.
*
- * Currently only 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
+ * Currently only 2x2 and 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
*
- * This method is usually significantly faster than the QR algorithm
+ * This method is usually significantly faster than the QR iterative algorithm
* but it might also be less accurate. It is also worth noting that
* for 3x3 matrices it involves trigonometric operations which are
* not necessarily available for all scalar types.
+ *
+ * For the 3x3 case, we observed the following worst case relative error regarding the eigenvalues:
+ * - double: 1e-8
+ * - float: 1e-3
*
* \sa compute(const MatrixType&, int options)
*/
@@ -474,11 +478,14 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
Index end = n-1;
Index start = 0;
Index iter = 0; // total number of iterations
-
+
+ typedef typename DiagType::RealScalar RealScalar;
+ const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
while (end>0)
{
for (Index i = start; i<end; ++i)
- if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1]))))
+ if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1]))) || abs(subdiag[i]) <= considerAsZero)
subdiag[i] = 0;
// find the largest unreduced block
@@ -537,6 +544,11 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
typedef typename SolverType::RealVectorType VectorType;
typedef typename SolverType::Scalar Scalar;
+
+ /** \internal
+ * Computes the roots of the characteristic polynomial of \a m.
+   * For numerical stability, m.trace() should be close to zero, and m should be normalized to avoid over- or underflow.
+ */
EIGEN_DEVICE_FUNC
static inline void computeRoots(const MatrixType& m, VectorType& roots)
{
@@ -557,40 +569,48 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
// Construct the parameters used in classifying the roots of the equation
// and in solving the equation for the roots in closed form.
Scalar c2_over_3 = c2*s_inv3;
- Scalar a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
- if (a_over_3 > Scalar(0))
- a_over_3 = Scalar(0);
+ Scalar a_over_3 = (c2*c2_over_3 - c1)*s_inv3;
+ a_over_3 = numext::maxi(a_over_3, Scalar(0));
Scalar half_b = Scalar(0.5)*(c0 + c2_over_3*(Scalar(2)*c2_over_3*c2_over_3 - c1));
- Scalar q = half_b*half_b + a_over_3*a_over_3*a_over_3;
- if (q > Scalar(0))
- q = Scalar(0);
+ Scalar q = a_over_3*a_over_3*a_over_3 - half_b*half_b;
+ q = numext::maxi(q, Scalar(0));
// Compute the eigenvalues by solving for the roots of the polynomial.
- Scalar rho = sqrt(-a_over_3);
- Scalar theta = atan2(sqrt(-q),half_b)*s_inv3;
+ Scalar rho = sqrt(a_over_3);
+ Scalar theta = atan2(sqrt(q),half_b)*s_inv3; // since sqrt(q) >= 0, atan2 is in [0, pi] and theta is in [0, pi/3]
Scalar cos_theta = cos(theta);
Scalar sin_theta = sin(theta);
- roots(0) = c2_over_3 + Scalar(2)*rho*cos_theta;
- roots(1) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta);
- roots(2) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta);
-
- // Sort in increasing order.
- if (roots(0) >= roots(1))
- numext::swap(roots(0),roots(1));
- if (roots(1) >= roots(2))
- {
- numext::swap(roots(1),roots(2));
- if (roots(0) >= roots(1))
- numext::swap(roots(0),roots(1));
- }
+ // roots are already sorted, since cos is monotonically decreasing on [0, pi]
+ roots(0) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta); // == 2*rho*cos(theta+2pi/3)
+ roots(1) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta); // == 2*rho*cos(theta-2pi/3)
+ roots(2) = c2_over_3 + Scalar(2)*rho*cos_theta;
}
-
+
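For reference, this is Viète's trigonometric solution of the characteristic cubic x^3 - c2 x^2 + c1 x - c0 = 0. With \rho = \sqrt{\texttt{a\_over\_3}} and \theta = \tfrac{1}{3}\operatorname{atan2}\big(\sqrt{q}, \texttt{half\_b}\big), the three roots are

\[
  \lambda_k \;=\; \frac{c_2}{3} + 2\rho\,\cos\!\Big(\theta + \frac{2k\pi}{3}\Big), \qquad k = 0, 1, 2,
\]

and since \theta \in [0, \pi/3], the cosines at \theta + 2\pi/3, \theta - 2\pi/3 and \theta are already in increasing order, which is what makes the explicit sort above removable.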
+ EIGEN_DEVICE_FUNC
+ static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
+ {
+ using std::abs;
+ Index i0;
+ // Find non-zero column i0 (by construction, there must exist a non-zero coefficient on the diagonal):
+ mat.diagonal().cwiseAbs().maxCoeff(&i0);
+ // mat.col(i0) is a good candidate for an orthogonal vector to the current eigenvector,
+ // so let's save it:
+ representative = mat.col(i0);
+ Scalar n0, n1;
+ VectorType c0, c1;
+ n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
+ n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
+ if(n0>n1) res = c0/std::sqrt(n0);
+ else res = c1/std::sqrt(n1);
+
+ return true;
+ }
+
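A standalone check of the idea behind extract_kernel (a sketch, not part of the patch): for a rank-2 symmetric 3x3 matrix, the cross product of two independent columns spans the one-dimensional kernel, i.e. the eigenvector being sought.

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      using namespace Eigen;
      Vector3d u(1, 2, 3), v(-1, 0, 1);
      Matrix3d A = u*u.transpose() + v*v.transpose(); // symmetric, rank 2
      Vector3d k = A.col(0).cross(A.col(1)).normalized();
      std::cout << (A*k).norm() << "\n";              // ~0: k spans ker(A)
      return 0;
    }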
EIGEN_DEVICE_FUNC
static inline void run(SolverType& solver, const MatrixType& mat, int options)
{
- using std::sqrt;
eigen_assert(mat.cols() == 3 && mat.cols() == mat.rows());
eigen_assert((options&~(EigVecMask|GenEigMask))==0
&& (options&EigVecMask)!=EigVecMask
@@ -600,116 +620,72 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
MatrixType& eivecs = solver.m_eivec;
VectorType& eivals = solver.m_eivalues;
- // map the matrix coefficients to [-1:1] to avoid over- and underflow.
- Scalar scale = mat.cwiseAbs().maxCoeff();
- MatrixType scaledMat = mat / scale;
+ // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
+ Scalar shift = mat.trace() / Scalar(3);
+ // TODO Avoid this copy. Currently it is necessary to suppress bogus values when determining maxCoeff and for computing the eigenvectors later
+ MatrixType scaledMat = mat.template selfadjointView<Lower>();
+ scaledMat.diagonal().array() -= shift;
+ Scalar scale = scaledMat.cwiseAbs().maxCoeff();
+ if(scale > 0) scaledMat /= scale; // TODO for scale==0 we could save the remaining operations
// compute the eigenvalues
computeRoots(scaledMat,eivals);
- // compute the eigen vectors
+ // compute the eigenvectors
if(computeEigenvectors)
{
- Scalar safeNorm2 = Eigen::NumTraits<Scalar>::epsilon();
if((eivals(2)-eivals(0))<=Eigen::NumTraits<Scalar>::epsilon())
{
+ // All three eigenvalues are numerically the same
eivecs.setIdentity();
}
else
{
- scaledMat = scaledMat.template selfadjointView<Lower>();
MatrixType tmp;
tmp = scaledMat;
+ // Compute the eigenvector of the most distinct eigenvalue
Scalar d0 = eivals(2) - eivals(1);
Scalar d1 = eivals(1) - eivals(0);
- int k = d0 > d1 ? 2 : 0;
- d0 = d0 > d1 ? d0 : d1;
-
- tmp.diagonal().array () -= eivals(k);
- VectorType cross;
- Scalar n;
- n = (cross = tmp.row(0).cross(tmp.row(1))).squaredNorm();
-
- if(n>safeNorm2)
+ Index k(0), l(2);
+ if(d0 > d1)
{
- eivecs.col(k) = cross / sqrt(n);
+ std::swap(k,l);
+ d0 = d1;
}
- else
+
+ // Compute the eigenvector of index k
{
- n = (cross = tmp.row(0).cross(tmp.row(2))).squaredNorm();
-
- if(n>safeNorm2)
- {
- eivecs.col(k) = cross / sqrt(n);
- }
- else
- {
- n = (cross = tmp.row(1).cross(tmp.row(2))).squaredNorm();
-
- if(n>safeNorm2)
- {
- eivecs.col(k) = cross / sqrt(n);
- }
- else
- {
- // the input matrix and/or the eigenvaues probably contains some inf/NaN,
- // => exit
- // scale back to the original size.
- eivals *= scale;
-
- solver.m_info = NumericalIssue;
- solver.m_isInitialized = true;
- solver.m_eigenvectorsOk = computeEigenvectors;
- return;
- }
- }
+ tmp.diagonal().array () -= eivals(k);
+ // By construction, 'tmp' is of rank 2, and its kernel corresponds to the respective eigenvector.
+ extract_kernel(tmp, eivecs.col(k), eivecs.col(l));
}
- tmp = scaledMat;
- tmp.diagonal().array() -= eivals(1);
-
- if(d0<=Eigen::NumTraits<Scalar>::epsilon())
+ // Compute eigenvector of index l
+ if(d0<=2*Eigen::NumTraits<Scalar>::epsilon()*d1)
{
- eivecs.col(1) = eivecs.col(k).unitOrthogonal();
+ // If d0 is too small, then the two other eigenvalues are numerically the same,
+ // and thus we only have to ortho-normalize the near orthogonal vector we saved above.
+ eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l))*eivecs.col(k);
+ eivecs.col(l).normalize();
}
else
{
- n = (cross = eivecs.col(k).cross(tmp.row(0))).squaredNorm();
- if(n>safeNorm2)
- {
- eivecs.col(1) = cross / sqrt(n);
- }
- else
- {
- n = (cross = eivecs.col(k).cross(tmp.row(1))).squaredNorm();
- if(n>safeNorm2)
- eivecs.col(1) = cross / sqrt(n);
- else
- {
- n = (cross = eivecs.col(k).cross(tmp.row(2))).squaredNorm();
- if(n>safeNorm2)
- eivecs.col(1) = cross / sqrt(n);
- else
- {
- // we should never reach this point,
- // if so the last two eigenvalues are likely to be very close to each other
- eivecs.col(1) = eivecs.col(k).unitOrthogonal();
- }
- }
- }
-
- // make sure that eivecs[1] is orthogonal to eivecs[2]
- // FIXME: this step should not be needed
- Scalar d = eivecs.col(1).dot(eivecs.col(k));
- eivecs.col(1) = (eivecs.col(1) - d * eivecs.col(k)).normalized();
+ tmp = scaledMat;
+ tmp.diagonal().array () -= eivals(l);
+
+ VectorType dummy;
+ extract_kernel(tmp, eivecs.col(l), dummy);
}
- eivecs.col(k==2 ? 0 : 2) = eivecs.col(k).cross(eivecs.col(1)).normalized();
+ // Compute last eigenvector from the other two
+ eivecs.col(1) = eivecs.col(2).cross(eivecs.col(0)).normalized();
}
}
+
// Rescale back to the original size.
eivals *= scale;
+ eivals.array() += shift;
solver.m_info = Success;
solver.m_isInitialized = true;
@@ -729,7 +705,7 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
static inline void computeRoots(const MatrixType& m, VectorType& roots)
{
using std::sqrt;
- const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*m(1,0)*m(1,0));
+ const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
roots(0) = t1 - t0;
roots(1) = t1 + t0;
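The change replaces m(1,0)*m(1,0) by numext::abs2(m(1,0)), which also covers complex off-diagonal entries. In either case the computed roots are

\[
  \lambda_{0,1} \;=\; \frac{m_{00}+m_{11}}{2} \;\mp\; \frac{1}{2}\sqrt{(m_{00}-m_{11})^2 + 4\,\lvert m_{10}\rvert^2}\,.
\]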
@@ -739,6 +715,7 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
static inline void run(SolverType& solver, const MatrixType& mat, int options)
{
EIGEN_USING_STD_MATH(sqrt);
+ EIGEN_USING_STD_MATH(abs);
eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows());
eigen_assert((options&~(EigVecMask|GenEigMask))==0
@@ -760,22 +737,29 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
// compute the eigen vectors
if(computeEigenvectors)
{
- scaledMat.diagonal().array () -= eivals(1);
- Scalar a2 = numext::abs2(scaledMat(0,0));
- Scalar c2 = numext::abs2(scaledMat(1,1));
- Scalar b2 = numext::abs2(scaledMat(1,0));
- if(a2>c2)
+ if((eivals(1)-eivals(0))<=abs(eivals(1))*Eigen::NumTraits<Scalar>::epsilon())
{
- eivecs.col(1) << -scaledMat(1,0), scaledMat(0,0);
- eivecs.col(1) /= sqrt(a2+b2);
+ eivecs.setIdentity();
}
else
{
- eivecs.col(1) << -scaledMat(1,1), scaledMat(1,0);
- eivecs.col(1) /= sqrt(c2+b2);
- }
+ scaledMat.diagonal().array () -= eivals(1);
+ Scalar a2 = numext::abs2(scaledMat(0,0));
+ Scalar c2 = numext::abs2(scaledMat(1,1));
+ Scalar b2 = numext::abs2(scaledMat(1,0));
+ if(a2>c2)
+ {
+ eivecs.col(1) << -scaledMat(1,0), scaledMat(0,0);
+ eivecs.col(1) /= sqrt(a2+b2);
+ }
+ else
+ {
+ eivecs.col(1) << -scaledMat(1,1), scaledMat(1,0);
+ eivecs.col(1) /= sqrt(c2+b2);
+ }
- eivecs.col(0) << eivecs.col(1).unitOrthogonal();
+ eivecs.col(0) << eivecs.col(1).unitOrthogonal();
+ }
}
// Rescale back to the original size.
diff --git a/Eigen/src/Geometry/AlignedBox.h b/Eigen/src/Geometry/AlignedBox.h
index b7c02e8db..08ee843db 100644
--- a/Eigen/src/Geometry/AlignedBox.h
+++ b/Eigen/src/Geometry/AlignedBox.h
@@ -19,10 +19,12 @@ namespace Eigen {
*
* \brief An axis aligned box
*
- * \param _Scalar the type of the scalar coefficients
- * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+ * \tparam _Scalar the type of the scalar coefficients
+ * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
*
* This class represents an axis aligned box as a pair of the minimal and maximal corners.
+ * \warning The result of most methods is undefined when applied to an empty box. You can check for empty boxes using isEmpty().
+ * \sa alignedboxtypedefs
*/
template <typename _Scalar, int _AmbientDim>
class AlignedBox
@@ -40,18 +42,21 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
/** Define constants to name the corners of a 1D, 2D or 3D axis aligned bounding box */
enum CornerType
{
- /** 1D names */
+ /** 1D names @{ */
Min=0, Max=1,
+ /** @} */
- /** Added names for 2D */
+ /** Identifiers for 2D corners @{ */
BottomLeft=0, BottomRight=1,
TopLeft=2, TopRight=3,
+ /** @} */
- /** Added names for 3D */
+ /** Identifiers for 3D corners @{ */
BottomLeftFloor=0, BottomRightFloor=1,
TopLeftFloor=2, TopRightFloor=3,
BottomLeftCeil=4, BottomRightCeil=5,
TopLeftCeil=6, TopRightCeil=7
+ /** @} */
};
@@ -63,34 +68,33 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
{ setEmpty(); }
- /** Constructs a box with extremities \a _min and \a _max. */
+ /** Constructs a box with extremities \a _min and \a _max.
+ * \warning If either component of \a _min is larger than the same component of \a _max, the constructed box is empty. */
template<typename OtherVectorType1, typename OtherVectorType2>
inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max) : m_min(_min), m_max(_max) {}
/** Constructs a box containing a single point \a p. */
template<typename Derived>
- inline explicit AlignedBox(const MatrixBase<Derived>& a_p)
- {
- typename internal::nested_eval<Derived,2>::type p(a_p.derived());
- m_min = p;
- m_max = p;
- }
+ inline explicit AlignedBox(const MatrixBase<Derived>& p) : m_min(p), m_max(m_min)
+ { }
~AlignedBox() {}
/** \returns the dimension in which the box holds */
inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime); }
- /** \deprecated use isEmpty */
+ /** \deprecated use isEmpty() */
inline bool isNull() const { return isEmpty(); }
- /** \deprecated use setEmpty */
+ /** \deprecated use setEmpty() */
inline void setNull() { setEmpty(); }
- /** \returns true if the box is empty. */
+ /** \returns true if the box is empty.
+ * \sa setEmpty */
inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
- /** Makes \c *this an empty box. */
+ /** Makes \c *this an empty box.
+ * \sa isEmpty */
inline void setEmpty()
{
m_min.setConstant( ScalarTraits::highest() );
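A short sketch of the empty-box convention documented above (not part of the patch): setEmpty() puts min above max in every component, so isEmpty() reduces to a min > max test.

    #include <Eigen/Geometry>
    #include <iostream>

    int main()
    {
      Eigen::AlignedBox3d box;                       // fixed-size ctor starts empty
      std::cout << box.isEmpty() << "\n";            // 1
      box.extend(Eigen::Vector3d(1, 2, 3));
      std::cout << box.isEmpty() << "\n";            // 0: a single-point box
      return 0;
    }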
@@ -175,31 +179,34 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
/** \returns true if the point \a p is inside the box \c *this. */
template<typename Derived>
- inline bool contains(const MatrixBase<Derived>& a_p) const
+ inline bool contains(const MatrixBase<Derived>& p) const
{
- typename internal::nested<Derived,2>::type p(a_p.derived());
- return (m_min.array()<=p.array()).all() && (p.array()<=m_max.array()).all();
+ typename internal::nested<Derived,2>::type p_n(p.derived());
+ return (m_min.array()<=p_n.array()).all() && (p_n.array()<=m_max.array()).all();
}
/** \returns true if the box \a b is entirely inside the box \c *this. */
inline bool contains(const AlignedBox& b) const
{ return (m_min.array()<=(b.min)().array()).all() && ((b.max)().array()<=m_max.array()).all(); }
- /** \returns true if the box \a b is intersecting the box \c *this. */
+ /** \returns true if the box \a b is intersecting the box \c *this.
+ * \sa intersection, clamp */
inline bool intersects(const AlignedBox& b) const
{ return (m_min.array()<=(b.max)().array()).all() && ((b.min)().array()<=m_max.array()).all(); }
- /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this. */
+ /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this.
+ * \sa extend(const AlignedBox&) */
template<typename Derived>
- inline AlignedBox& extend(const MatrixBase<Derived>& a_p)
+ inline AlignedBox& extend(const MatrixBase<Derived>& p)
{
- typename internal::nested<Derived,2>::type p(a_p.derived());
- m_min = m_min.cwiseMin(p);
- m_max = m_max.cwiseMax(p);
+ typename internal::nested<Derived,2>::type p_n(p.derived());
+ m_min = m_min.cwiseMin(p_n);
+ m_max = m_max.cwiseMax(p_n);
return *this;
}
- /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this. */
+ /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this.
+ * \sa merged, extend(const MatrixBase&) */
inline AlignedBox& extend(const AlignedBox& b)
{
m_min = m_min.cwiseMin(b.m_min);
@@ -207,7 +214,9 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
return *this;
}
- /** Clamps \c *this by the box \a b and returns a reference to \c *this. */
+ /** Clamps \c *this by the box \a b and returns a reference to \c *this.
+ * \note If the boxes don't intersect, the resulting box is empty.
+ * \sa intersection(), intersects() */
inline AlignedBox& clamp(const AlignedBox& b)
{
m_min = m_min.cwiseMax(b.m_min);
@@ -215,11 +224,15 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
return *this;
}
- /** Returns an AlignedBox that is the intersection of \a b and \c *this */
+ /** Returns an AlignedBox that is the intersection of \a b and \c *this
+ * \note If the boxes don't intersect, the resulting box is empty.
+ * \sa intersects(), clamp, contains() */
inline AlignedBox intersection(const AlignedBox& b) const
{return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); }
- /** Returns an AlignedBox that is the union of \a b and \c *this */
+ /** Returns an AlignedBox that is the union of \a b and \c *this.
+ * \note Merging with an empty box may result in a box bigger than \c *this.
+ * \sa extend(const AlignedBox&) */
inline AlignedBox merged(const AlignedBox& b) const
{ return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); }
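The two notes above in action (a sketch, not part of the patch): intersecting disjoint boxes yields an empty box, while merged() always contains both operands.

    #include <Eigen/Geometry>
    #include <iostream>

    int main()
    {
      using Eigen::AlignedBox2d;
      using Eigen::Vector2d;
      AlignedBox2d a(Vector2d(0, 0), Vector2d(2, 2));
      AlignedBox2d b(Vector2d(3, 3), Vector2d(4, 4));    // disjoint from a
      std::cout << a.intersects(b) << "\n";              // 0
      std::cout << a.intersection(b).isEmpty() << "\n";  // 1, as documented
      std::cout << a.merged(b).contains(a) << "\n";      // 1
      return 0;
    }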
@@ -235,20 +248,20 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
/** \returns the squared distance between the point \a p and the box \c *this,
* and zero if \a p is inside the box.
- * \sa exteriorDistance()
+ * \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&)
*/
template<typename Derived>
- inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& a_p) const;
+ inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& p) const;
/** \returns the squared distance between the boxes \a b and \c *this,
* and zero if the boxes intersect.
- * \sa exteriorDistance()
+ * \sa exteriorDistance(const AlignedBox&), squaredExteriorDistance(const MatrixBase&)
*/
inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
/** \returns the distance between the point \a p and the box \c *this,
* and zero if \a p is inside the box.
- * \sa squaredExteriorDistance()
+ * \sa squaredExteriorDistance(const MatrixBase&), exteriorDistance(const AlignedBox&)
*/
template<typename Derived>
inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
@@ -256,7 +269,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
/** \returns the distance between the boxes \a b and \c *this,
* and zero if the boxes intersect.
- * \sa squaredExteriorDistance()
+ * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&)
*/
inline NonInteger exteriorDistance(const AlignedBox& b) const
{ using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(b))); }
diff --git a/Eigen/src/Geometry/Homogeneous.h b/Eigen/src/Geometry/Homogeneous.h
index 756ecf9dc..b7f996615 100644
--- a/Eigen/src/Geometry/Homogeneous.h
+++ b/Eigen/src/Geometry/Homogeneous.h
@@ -365,6 +365,37 @@ struct generic_product_impl<Homogeneous<LhsArg,Horizontal>, Rhs, HomogeneousShap
}
};
+template<typename Lhs,typename Rhs>
+struct homogeneous_right_product_refactoring_helper
+{
+ enum {
+ Dim = Lhs::ColsAtCompileTime,
+ Rows = Lhs::RowsAtCompileTime
+ };
+ typedef typename Rhs::template ConstNRowsBlockXpr<Dim>::Type LinearBlockConst;
+ typedef typename remove_const<LinearBlockConst>::type LinearBlock;
+ typedef typename Rhs::ConstRowXpr ConstantColumn;
+ typedef Replicate<const ConstantColumn,Rows,1> ConstantBlock;
+ typedef Product<Lhs,LinearBlock,LazyProduct> LinearProduct;
+ typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, HomogeneousShape, DenseShape>
+ : public evaluator<typename homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression,Rhs>::Xpr>
+{
+ typedef Product<Lhs, Rhs, LazyProduct> XprType;
+ typedef homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression,Rhs> helper;
+ typedef typename helper::ConstantBlock ConstantBlock;
+ typedef typename helper::Xpr RefactoredXpr;
+ typedef evaluator<RefactoredXpr> Base;
+
+ EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+ : Base( xpr.lhs().nestedExpression() .lazyProduct( xpr.rhs().template topRows<helper::Dim>(xpr.lhs().nestedExpression().cols()) )
+ + ConstantBlock(xpr.rhs().row(xpr.rhs().rows()-1),xpr.lhs().rows(), 1) )
+ {}
+};
+
template<typename Lhs, typename RhsArg, int ProductTag>
struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, HomogeneousShape, ProductTag>
{
@@ -375,6 +406,37 @@ struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, Homog
}
};
+template<typename Lhs,typename Rhs>
+struct homogeneous_left_product_refactoring_helper
+{
+ enum {
+ Dim = Rhs::RowsAtCompileTime,
+ Cols = Rhs::ColsAtCompileTime
+ };
+ typedef typename Lhs::template ConstNColsBlockXpr<Dim>::Type LinearBlockConst;
+ typedef typename remove_const<LinearBlockConst>::type LinearBlock;
+ typedef typename Lhs::ConstColXpr ConstantColumn;
+ typedef Replicate<const ConstantColumn,1,Cols> ConstantBlock;
+ typedef Product<LinearBlock,Rhs,LazyProduct> LinearProduct;
+ typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, HomogeneousShape>
+ : public evaluator<typename homogeneous_left_product_refactoring_helper<Lhs,typename Rhs::NestedExpression>::Xpr>
+{
+ typedef Product<Lhs, Rhs, LazyProduct> XprType;
+ typedef homogeneous_left_product_refactoring_helper<Lhs,typename Rhs::NestedExpression> helper;
+ typedef typename helper::ConstantBlock ConstantBlock;
+ typedef typename helper::Xpr RefactoredXpr;
+ typedef evaluator<RefactoredXpr> Base;
+
+ EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+ : Base( xpr.lhs().template leftCols<helper::Dim>(xpr.rhs().nestedExpression().rows()) .lazyProduct( xpr.rhs().nestedExpression() )
+ + ConstantBlock(xpr.lhs().col(xpr.lhs().cols()-1),1,xpr.rhs().cols()) )
+ {}
+};
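What these evaluators compute, spelled out on a concrete case (a sketch, not part of the patch): a product against a homogeneous expression is refactored into a plain product with the linear part plus a replicated constant row (resp. column), avoiding the temporary that an explicit homogenization would create.

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      using namespace Eigen;
      Matrix<double,2,3> L = Matrix<double,2,3>::Random(); // lhs, homogenized to 2x4
      Matrix4d R = Matrix4d::Random();                     // dense rhs
      Matrix<double,2,4> Lh;
      Lh << L, Vector2d::Ones();                           // explicit homogenization
      Matrix<double,2,4> ref = Lh * R;
      Matrix<double,2,4> alt = L * R.topRows<3>()          // linear part
                             + R.row(3).replicate<2,1>();  // replicated last row
      std::cout << (ref - alt).norm() << "\n";             // ~0
      return 0;
    }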
+
template<typename Scalar, int Dim, int Mode,int Options, typename RhsArg, int ProductTag>
struct generic_product_impl<Transform<Scalar,Dim,Mode,Options>, Homogeneous<RhsArg,Vertical>, DenseShape, HomogeneousShape, ProductTag>
{
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index a89d75958..15a063994 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -232,7 +232,7 @@ public:
typedef _Scalar Scalar;
- EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Quaternion)
+ EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Quaternion)
using Base::operator*=;
typedef typename internal::traits<Quaternion>::Coefficients Coefficients;
@@ -342,7 +342,7 @@ class Map<const Quaternion<_Scalar>, _Options >
typedef _Scalar Scalar;
typedef typename internal::traits<Map>::Coefficients Coefficients;
- EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Map)
+ EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
using Base::operator*=;
/** Constructs a Mapped Quaternion object from the pointer \a coeffs
@@ -379,7 +379,7 @@ class Map<Quaternion<_Scalar>, _Options >
typedef _Scalar Scalar;
typedef typename internal::traits<Map>::Coefficients Coefficients;
- EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Map)
+ EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
using Base::operator*=;
/** Constructs a Mapped Quaternion object from the pointer \a coeffs
@@ -637,7 +637,7 @@ inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Der
{
// FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite() ??
Scalar n2 = this->squaredNorm();
- if (n2 > 0)
+ if (n2 > Scalar(0))
return Quaternion<Scalar>(conjugate().coeffs() / n2);
else
{
@@ -723,7 +723,7 @@ QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerive
scale0 = sin( ( Scalar(1) - t ) * theta) / sinTheta;
scale1 = sin( ( t * theta) ) / sinTheta;
}
- if(d<0) scale1 = -scale1;
+ if(d<Scalar(0)) scale1 = -scale1;
return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
}
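For reference, the two guarded formulas, now compared against Scalar(0) so they also compile for scalar types without an implicit conversion from int:

\[
  q^{-1} = \frac{\bar q}{\lVert q\rVert^2} \quad (\lVert q\rVert^2 > 0),
  \qquad
  \mathrm{slerp}(t; q_0, q_1) = \frac{\sin((1-t)\theta)}{\sin\theta}\, q_0
    + \operatorname{sign}(d)\,\frac{\sin(t\theta)}{\sin\theta}\, q_1,
\]

where d = \langle q_0, q_1 \rangle and the sign flip keeps the interpolation on the shorter arc.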
diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index e67f09184..be98993f0 100644
--- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -59,20 +59,21 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
VectorType s(n), t(n);
- RealScalar tol2 = tol*tol;
+ RealScalar tol2 = tol*tol*rhs_sqnorm;
RealScalar eps2 = NumTraits<Scalar>::epsilon()*NumTraits<Scalar>::epsilon();
Index i = 0;
Index restarts = 0;
- while ( r.squaredNorm()/rhs_sqnorm > tol2 && i<maxIters )
+ while ( r.squaredNorm() > tol2 && i<maxIters )
{
Scalar rho_old = rho;
rho = r0.dot(r);
if (abs(rho) < eps2*r0_sqnorm)
{
- // The new residual vector became too orthogonal to the arbitrarily choosen direction r0
+ // The new residual vector became too orthogonal to the arbitrarily chosen direction r0
// Let's restart with a new r0:
+ r = rhs - mat * x;
r0 = r;
rho = r0_sqnorm = r.squaredNorm();
if(restarts++ == 0)
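Folding rhs_sqnorm into tol2 turns the per-iteration stopping test into a single comparison without changing its meaning:

\[
  \frac{\lVert r\rVert^2}{\lVert b\rVert^2} > \varepsilon^2
  \;\Longleftrightarrow\;
  \lVert r\rVert^2 > \varepsilon^2\,\lVert b\rVert^2 .
\]

The restart additionally recomputes r = b - A x, so r0 is re-seeded from the true residual rather than from the recursively updated one, whose accumulated rounding error is the usual cause of such breakdowns.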
@@ -202,8 +203,8 @@ public:
template<typename Rhs,typename Dest>
void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
{
- // x.setZero();
- x = b;
+ x.resize(this->rows(),b.cols());
+ x.setZero();
_solve_with_guess_impl(b,x);
}
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index b7f8debb3..102e01f76 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -157,7 +157,6 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageInd
{
analyzePattern(amat);
factorize(amat);
- m_isInitialized = m_factorizationIsOk;
return *this;
}
@@ -232,6 +231,8 @@ void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
m_Pinv = m_P.inverse(); // ... and the inverse permutation
m_analysisIsOk = true;
+ m_factorizationIsOk = false;
+ m_isInitialized = false;
}
template <typename Scalar, typename StorageIndex>
@@ -440,6 +441,7 @@ void IncompleteLUT<Scalar,StorageIndex>::factorize(const _MatrixType& amat)
m_lu.makeCompressed();
m_factorizationIsOk = true;
+ m_isInitialized = m_factorizationIsOk;
m_info = Success;
}
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 9b141c8df..8e673838f 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -375,7 +375,7 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
template<typename MatrixType>
void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift)
{
- // requires nbRows = nbCols + 1;
+ // requires rows = cols + 1;
using std::pow;
using std::sqrt;
using std::abs;
@@ -825,7 +825,7 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
{
RealScalar midShifted = (leftShifted + rightShifted) / 2;
- RealScalar fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+ fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
if (fLeft * fMid < 0)
{
rightShifted = midShifted;
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index a46a47104..e29d36cf2 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -387,7 +387,7 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
if(work_matrix.coeff(p,q) != Scalar(0))
{
- Scalar z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+ z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
work_matrix.col(q) *= z;
if(svd.computeV()) svd.m_matrixV.col(q) *= z;
}
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index 52c7da297..5af270bc5 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -50,9 +50,12 @@ class CompressedStorage
CompressedStorage& operator=(const CompressedStorage& other)
{
- resize(other.size());
- internal::smart_copy(other.m_values, other.m_values + m_size, m_values);
- internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
+ if(other.size()>0)
+ {
+ resize(other.size());
+ internal::smart_copy(other.m_values, other.m_values + m_size, m_values);
+ internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
+ }
return *this;
}
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 778939791..b41e8f15d 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -285,6 +285,9 @@ public:
{}
using Base::operator=;
+private:
+ template<typename Derived> BlockImpl(const SparseMatrixBase<Derived>& xpr, Index i);
+ template<typename Derived> BlockImpl(const SparseMatrixBase<Derived>& xpr);
};
//----------
diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index f53427abf..096af7fb0 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -29,6 +29,24 @@ namespace Eigen {
// 4 - dense op dense product dense
// generic dense
+template<typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
+ : public SparseMatrixBase<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+ public:
+ typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
+ EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+ CwiseBinaryOpImpl()
+ {
+ typedef typename internal::traits<Lhs>::StorageKind LhsStorageKind;
+ typedef typename internal::traits<Rhs>::StorageKind RhsStorageKind;
+ EIGEN_STATIC_ASSERT((
+ (!internal::is_same<LhsStorageKind,RhsStorageKind>::value)
+ || ((Lhs::Flags&RowMajorBit) == (Rhs::Flags&RowMajorBit))),
+ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH);
+ }
+};
+
namespace internal {
template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index edb9d5998..731d40f29 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -41,7 +41,7 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
typename Res::Scalar tmp(0);
for(LhsInnerIterator it(lhsEval,j); it ;++it)
tmp += it.value() * rhs.coeff(it.index(),c);
- res.coeffRef(j,c) = alpha * tmp;
+ res.coeffRef(j,c) += alpha * tmp;
}
}
}
@@ -128,17 +128,18 @@ namespace internal {
template<typename Lhs, typename Rhs, int ProductType>
struct generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
+ : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SparseShape,DenseShape,ProductType> >
{
+ typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
template<typename Dest>
- static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+ static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
LhsNested lhsNested(lhs);
RhsNested rhsNested(rhs);
-
- dst.setZero();
- internal::sparse_time_dense_product(lhsNested, rhsNested, dst, typename Dest::Scalar(1));
+ internal::sparse_time_dense_product(lhsNested, rhsNested, dst, alpha);
}
};
@@ -149,19 +150,21 @@ struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, DenseShape, Product
template<typename Lhs, typename Rhs, int ProductType>
struct generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
+ : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SparseShape,ProductType> >
{
- template<typename Dest>
- static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+ typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+
+ template<typename Dst>
+ static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
LhsNested lhsNested(lhs);
RhsNested rhsNested(rhs);
- dst.setZero();
// transpose everything
- Transpose<Dest> dstT(dst);
- internal::sparse_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, typename Dest::Scalar(1));
+ Transpose<Dst> dstT(dst);
+ internal::sparse_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
}
};
diff --git a/Eigen/src/SparseCore/SparseDiagonalProduct.h b/Eigen/src/SparseCore/SparseDiagonalProduct.h
index b7598c885..29a67da35 100644
--- a/Eigen/src/SparseCore/SparseDiagonalProduct.h
+++ b/Eigen/src/SparseCore/SparseDiagonalProduct.h
@@ -107,7 +107,8 @@ struct sparse_diagonal_product_evaluator<SparseXprType, DiagCoeffType, SDP_AsCwi
{
public:
InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
- : m_cwiseEval(xprEval.m_sparseXprNested.innerVector(outer).cwiseProduct(xprEval.m_diagCoeffNested)),
+ : m_cwiseXpr(xprEval.m_sparseXprNested.innerVector(outer).cwiseProduct(xprEval.m_diagCoeffNested)),
+ m_cwiseEval(m_cwiseXpr),
m_cwiseIter(m_cwiseEval, 0),
m_outer(outer)
{}
@@ -123,6 +124,7 @@ struct sparse_diagonal_product_evaluator<SparseXprType, DiagCoeffType, SDP_AsCwi
inline operator bool() const { return m_cwiseIter; }
protected:
+ const CwiseProductType m_cwiseXpr;
CwiseProductEval m_cwiseEval;
CwiseProductIterator m_cwiseIter;
Index m_outer;
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index ef93cf80c..d9964d0f6 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -262,22 +262,25 @@ class SparseMatrix
#ifdef EIGEN_PARSED_BY_DOXYGEN
/** Preallocates \a reserveSize[\c j] non zeros for each column (resp. row) \c j.
*
- * This function turns the matrix in non-compressed mode */
+ * This function turns the matrix in non-compressed mode.
+ *
+ * The type \c SizesType must expose the following interface:
+ \code
+ typedef value_type;
+ const value_type& operator[](i) const;
+ \endcode
+ * for \c i in the [0,this->outerSize()[ range.
+ * Typical choices include std::vector<int>, Eigen::VectorXi, Eigen::VectorXi::Constant, etc.
+ */
template<class SizesType>
inline void reserve(const SizesType& reserveSizes);
#else
template<class SizesType>
- inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif = typename SizesType::value_type())
- {
- EIGEN_UNUSED_VARIABLE(enableif);
- reserveInnerVectors(reserveSizes);
- }
- template<class SizesType>
- inline void reserve(const SizesType& reserveSizes, const typename SizesType::Scalar& enableif =
+ inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif =
#if (!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1500) // MSVC 2005 fails to compile with this typename
typename
#endif
- SizesType::Scalar())
+ SizesType::value_type())
{
EIGEN_UNUSED_VARIABLE(enableif);
reserveInnerVectors(reserveSizes);
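A usage sketch of the documented interface (not part of the patch):

    #include <Eigen/Sparse>

    int main()
    {
      Eigen::SparseMatrix<double> A(1000, 1000);
      // Per-column reservation; any type exposing value_type and operator[]
      // works, VectorXi::Constant being the idiomatic uniform estimate.
      A.reserve(Eigen::VectorXi::Constant(A.cols(), 6));
      for (int j = 0; j < A.cols(); ++j)
        A.insert(j, j) = 1.0;          // cheap thanks to the reservation
      A.makeCompressed();
      return 0;
    }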
@@ -722,6 +725,9 @@ class SparseMatrix
}
else if(this!=&other)
{
+ #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ #endif
initAssignment(other);
if(other.isCompressed())
{
@@ -789,10 +795,8 @@ class SparseMatrix
std::free(m_innerNonZeros);
}
-#ifndef EIGEN_PARSED_BY_DOXYGEN
/** Overloaded for performance */
Scalar sum() const;
-#endif
# ifdef EIGEN_SPARSEMATRIX_PLUGIN
# include EIGEN_SPARSEMATRIX_PLUGIN
@@ -1118,9 +1122,9 @@ typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Op
// so that the entire free-space is allocated to the current inner-vector.
eigen_internal_assert(data_end < m_data.allocatedSize());
StorageIndex new_end = convert_index(m_data.allocatedSize());
- for(Index j=outer+1; j<=m_outerSize; ++j)
- if(m_outerIndex[j]==data_end)
- m_outerIndex[j] = new_end;
+ for(Index k=outer+1; k<=m_outerSize; ++k)
+ if(m_outerIndex[k]==data_end)
+ m_outerIndex[k] = new_end;
}
return m_data.value(p);
}
@@ -1143,9 +1147,9 @@ typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Op
// so that the entire free-space is allocated to the current inner-vector.
eigen_internal_assert(data_end < m_data.allocatedSize());
StorageIndex new_end = convert_index(m_data.allocatedSize());
- for(Index j=outer+1; j<=m_outerSize; ++j)
- if(m_outerIndex[j]==data_end)
- m_outerIndex[j] = new_end;
+ for(Index k=outer+1; k<=m_outerSize; ++k)
+ if(m_outerIndex[k]==data_end)
+ m_outerIndex[k] = new_end;
}
// and insert it at the right position (sorted insertion)
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index d4ab8b908..f1b5d2a97 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -28,6 +28,12 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
public:
typedef typename internal::traits<Derived>::Scalar Scalar;
+
+ /** The numeric type of the expression's coefficients, e.g. float, double, int or std::complex<float>, etc.
+ *
+ * It is an alias for the Scalar type */
+ typedef Scalar value_type;
+
typedef typename internal::packet_traits<Scalar>::type PacketScalar;
typedef typename internal::traits<Derived>::StorageKind StorageKind;
typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h
index 2ca039323..8df62a119 100644
--- a/Eigen/src/SparseCore/SparseRef.h
+++ b/Eigen/src/SparseCore/SparseRef.h
@@ -12,6 +12,10 @@
namespace Eigen {
+enum {
+ StandardCompressedFormat = 2
+};
+
namespace internal {
template<typename Derived> class SparseRefBase;
@@ -72,6 +76,19 @@ protected:
} // namespace internal
+
+/**
+ * \ingroup Sparse_Module
+ *
+ * \brief A sparse matrix expression referencing an existing sparse expression
+ *
+ * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data
+ * \tparam Options specifies whether a standard compressed format is required; \c Options can be \c #StandardCompressedFormat or \c 0.
+ * The default is \c 0.
+ * \tparam StrideType Only used for dense Ref
+ *
+ * \sa class Ref
+ */
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
: public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
@@ -93,6 +110,7 @@ class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
{
EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+ eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
Base::construct(expr.derived());
}
@@ -100,6 +118,7 @@ class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
{
EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+ eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
Base::construct(expr.derived());
}
@@ -112,6 +131,7 @@ class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
{
EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+ eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
Base::construct(expr.const_cast_derived());
}
};
@@ -148,7 +168,15 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
template<typename Expression>
void construct(const Expression& expr,internal::true_type)
{
- Base::construct(expr);
+ if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed()))
+ {
+ m_object = expr;
+ Base::construct(m_object);
+ }
+ else
+ {
+ Base::construct(expr);
+ }
}
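A sketch of the new option (not part of the patch): with StandardCompressedFormat, a writable Ref asserts that the incoming matrix is compressed, while a const Ref, as implemented above, silently falls back to a compressed copy.

    #include <Eigen/Sparse>

    // The raw CSC arrays of A are now safe to hand to a C API.
    void use_csc(const Eigen::Ref<const Eigen::SparseMatrix<double>,
                                  Eigen::StandardCompressedFormat>& A)
    {
      const double* values = A.valuePtr();
      const int*    inner  = A.innerIndexPtr();
      const int*    outer  = A.outerIndexPtr();
      (void)values; (void)inner; (void)outer;
    }

    int main()
    {
      Eigen::SparseMatrix<double> M(10, 10);
      M.insert(3, 3) = 1.0;   // leaves M in uncompressed mode
      use_csc(M);             // const Ref: a compressed copy is made on the fly
      return 0;
    }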
template<typename Expression>
diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h
index 34ec07a13..0e6c01531 100644
--- a/Eigen/src/SparseCore/SparseTriangularView.h
+++ b/Eigen/src/SparseCore/SparseTriangularView.h
@@ -11,8 +11,17 @@
#ifndef EIGEN_SPARSE_TRIANGULARVIEW_H
#define EIGEN_SPARSE_TRIANGULARVIEW_H
-namespace Eigen {
+namespace Eigen {
+/** \ingroup SparseCore_Module
+ *
+ * \brief Base class for a triangular part in a \b sparse matrix
+ *
+ * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+ * It extends class TriangularView with additional methods which are available for sparse expressions only.
+ *
+ * \sa class TriangularView, SparseMatrixBase::triangularView()
+ */
template<typename MatrixType, unsigned int Mode> class TriangularViewImpl<MatrixType,Mode,Sparse>
: public SparseMatrixBase<TriangularView<MatrixType,Mode> >
{
diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h
index 1c69aa458..0a87f01d9 100644
--- a/Eigen/src/SparseCore/SparseView.h
+++ b/Eigen/src/SparseCore/SparseView.h
@@ -36,9 +36,9 @@ public:
EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView)
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
- explicit SparseView(const MatrixType& mat, const Scalar& m_reference = Scalar(0),
- RealScalar m_epsilon = NumTraits<Scalar>::dummy_precision()) :
- m_matrix(mat), m_reference(m_reference), m_epsilon(m_epsilon) {}
+ explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0),
+ RealScalar epsilon = NumTraits<Scalar>::dummy_precision())
+ : m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {}
inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }
diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h
index fd1a55bc6..8872012db 100644
--- a/Eigen/src/SparseCore/TriangularSolver.h
+++ b/Eigen/src/SparseCore/TriangularSolver.h
@@ -75,7 +75,7 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
for(Index i=lhs.rows()-1 ; i>=0 ; --i)
{
Scalar tmp = other.coeff(i,col);
- Scalar l_ii = 0;
+ Scalar l_ii(0);
LhsIterator it(lhsEval, i);
while(it && it.index()<i)
++it;
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 1bbd2758e..d067d8fdf 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -503,11 +503,9 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
*/
void factorize(const MatrixType& matrix);
- #ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal */
template<typename Rhs,typename Dest>
void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
- #endif // EIGEN_PARSED_BY_DOXYGEN
inline const LMatrixType& matrixL() const
{
diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 3d30403c7..0a5043ef2 100644
--- a/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -107,15 +107,6 @@ inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *N
return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
}
-namespace internal {
- template<typename T> struct umfpack_helper_is_sparse_plain : false_type {};
- template<typename Scalar, int Options, typename StorageIndex>
- struct umfpack_helper_is_sparse_plain<SparseMatrix<Scalar,Options,StorageIndex> >
- : true_type {};
- template<typename Scalar, int Options, typename StorageIndex>
- struct umfpack_helper_is_sparse_plain<MappedSparseMatrix<Scalar,Options,StorageIndex> >
- : true_type {};
-}
/** \ingroup UmfPackSupport_Module
* \brief A sparse LU factorization and solver based on UmfPack
@@ -147,12 +138,18 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
typedef SparseMatrix<Scalar> LUMatrixType;
typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
+ typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
public:
- UmfPackLU() { init(); }
+ UmfPackLU()
+ : m_dummy(0,0), mp_matrix(m_dummy)
+ {
+ init();
+ }
explicit UmfPackLU(const MatrixType& matrix)
+ : mp_matrix(matrix)
{
init();
compute(matrix);
@@ -164,8 +161,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
}
- inline Index rows() const { return m_copyMatrix.rows(); }
- inline Index cols() const { return m_copyMatrix.cols(); }
+ inline Index rows() const { return mp_matrix.rows(); }
+ inline Index cols() const { return mp_matrix.cols(); }
/** \brief Reports whether previous computation was successful.
*
@@ -211,7 +208,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
{
if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
- grapInput(matrix.derived());
+ grab(matrix.derived());
analyzePattern_impl();
factorize_impl();
}
@@ -228,7 +225,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar());
- grapInput(matrix.derived());
+ grab(matrix.derived());
analyzePattern_impl();
}
@@ -246,16 +243,14 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
if(m_numeric)
umfpack_free_numeric(&m_numeric,Scalar());
- grapInput(matrix.derived());
+ grab(matrix.derived());
factorize_impl();
}
- #ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal */
template<typename BDerived,typename XDerived>
bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
- #endif
Scalar determinant() const;
@@ -269,53 +264,16 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
m_isInitialized = false;
m_numeric = 0;
m_symbolic = 0;
- m_outerIndexPtr = 0;
- m_innerIndexPtr = 0;
- m_valuePtr = 0;
m_extractedDataAreDirty = true;
}
- template<typename InputMatrixType>
- void grapInput_impl(const InputMatrixType& mat, internal::true_type)
- {
- m_copyMatrix.resize(mat.rows(), mat.cols());
- if( ((MatrixType::Flags&RowMajorBit)==RowMajorBit) || sizeof(typename MatrixType::StorageIndex)!=sizeof(int) || !mat.isCompressed() )
- {
- // non supported input -> copy
- m_copyMatrix = mat;
- m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
- m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
- m_valuePtr = m_copyMatrix.valuePtr();
- }
- else
- {
- m_outerIndexPtr = mat.outerIndexPtr();
- m_innerIndexPtr = mat.innerIndexPtr();
- m_valuePtr = mat.valuePtr();
- }
- }
-
- template<typename InputMatrixType>
- void grapInput_impl(const InputMatrixType& mat, internal::false_type)
- {
- m_copyMatrix = mat;
- m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
- m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
- m_valuePtr = m_copyMatrix.valuePtr();
- }
-
- template<typename InputMatrixType>
- void grapInput(const InputMatrixType& mat)
- {
- grapInput_impl(mat, internal::umfpack_helper_is_sparse_plain<InputMatrixType>());
- }
-
void analyzePattern_impl()
{
int errorCode = 0;
- errorCode = umfpack_symbolic(internal::convert_index<int>(m_copyMatrix.rows()),
- internal::convert_index<int>(m_copyMatrix.cols()),
- m_outerIndexPtr, m_innerIndexPtr, m_valuePtr, &m_symbolic, 0, 0);
+ errorCode = umfpack_symbolic(internal::convert_index<int>(mp_matrix.rows()),
+ internal::convert_index<int>(mp_matrix.cols()),
+ mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+ &m_symbolic, 0, 0);
m_isInitialized = true;
m_info = errorCode ? InvalidInput : Success;
@@ -327,24 +285,39 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
void factorize_impl()
{
int errorCode;
- errorCode = umfpack_numeric(m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
+ errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
m_symbolic, &m_numeric, 0, 0);
m_info = errorCode ? NumericalIssue : Success;
m_factorizationIsOk = true;
m_extractedDataAreDirty = true;
}
-
+
+ template<typename MatrixDerived>
+ void grab(const EigenBase<MatrixDerived> &A)
+ {
+ mp_matrix.~UmfpackMatrixRef();
+ ::new (&mp_matrix) UmfpackMatrixRef(A.derived());
+ }
+
+ void grab(const UmfpackMatrixRef &A)
+ {
+ if(&(A.derived()) != &mp_matrix)
+ {
+ mp_matrix.~UmfpackMatrixRef();
+ ::new (&mp_matrix) UmfpackMatrixRef(A);
+ }
+ }
+
// cached data to reduce reallocation, etc.
mutable LUMatrixType m_l;
mutable LUMatrixType m_u;
mutable IntColVectorType m_p;
mutable IntRowVectorType m_q;
- UmfpackMatrixType m_copyMatrix;
- const Scalar* m_valuePtr;
- const int* m_outerIndexPtr;
- const int* m_innerIndexPtr;
+ UmfpackMatrixType m_dummy;
+ UmfpackMatrixRef mp_matrix;
+
void* m_numeric;
void* m_symbolic;
@@ -416,7 +389,7 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
if(x.innerStride()==1)
x_ptr = &x.col(j).coeffRef(0);
errorCode = umfpack_solve(UMFPACK_A,
- m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
+ mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
if(x.innerStride()!=1)
x.col(j) = x_tmp;
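The grab() helpers above rely on the standard idiom for re-seating a non-assignable member (here a Ref, which behaves like a reference): explicitly end its lifetime, then placement-new a fresh one into the same storage. A minimal sketch of the idiom, with hypothetical names:

    #include <new>

    struct Binding
    {
      const int& ref;                  // a reference member makes Binding non-assignable
      explicit Binding(const int& r) : ref(r) {}
    };

    void rebind(Binding& b, const int& other)
    {
      b.~Binding();                    // end the old binding
      ::new (&b) Binding(other);       // construct a new one in place
    }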
diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 0c22184c0..58844800a 100644
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -74,6 +74,44 @@ max
return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
}
+// TODO code generating macros could be moved to Macros.h and could include generation of documentation
+#define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR) \
+template<typename OtherDerived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived> \
+OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+{ \
+ return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived>(derived(), other.derived()); \
+}\
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Cmp ## COMPARATOR ## ReturnType \
+OP(const Scalar& s) const { \
+ return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \
+} \
+EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \
+OP(const Scalar& s, const Derived& d) { \
+ return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \
+}
+
+#define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR) \
+template<typename OtherDerived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived> \
+OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+{ \
+ return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived>(other.derived(), derived()); \
+} \
+EIGEN_DEVICE_FUNC \
+inline const RCmp ## RCOMPARATOR ## ReturnType \
+OP(const Scalar& s) const { \
+ return Derived::PlainObject::Constant(rows(), cols(), s).R_OP(*this); \
+} \
+friend inline const Cmp ## RCOMPARATOR ## ReturnType \
+OP(const Scalar& s, const Derived& d) { \
+ return d.R_OP(Derived::PlainObject::Constant(d.rows(), d.cols(), s)); \
+}
+
+
+
/** \returns an expression of the coefficient-wise \< operator of *this and \a other
*
* Example: \include Cwise_less.cpp
@@ -81,7 +119,7 @@ max
*
* \sa all(), any(), operator>(), operator<=()
*/
-EIGEN_MAKE_CWISE_BINARY_OP(operator<,std::less)
+EIGEN_MAKE_CWISE_COMP_OP(operator<, LT)
/** \returns an expression of the coefficient-wise \<= operator of *this and \a other
*
@@ -90,7 +128,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator<,std::less)
*
* \sa all(), any(), operator>=(), operator<()
*/
-EIGEN_MAKE_CWISE_BINARY_OP(operator<=,std::less_equal)
+EIGEN_MAKE_CWISE_COMP_OP(operator<=, LE)
/** \returns an expression of the coefficient-wise \> operator of *this and \a other
*
@@ -99,7 +137,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator<=,std::less_equal)
*
* \sa all(), any(), operator>=(), operator<()
*/
-EIGEN_MAKE_CWISE_BINARY_OP(operator>,std::greater)
+EIGEN_MAKE_CWISE_COMP_R_OP(operator>, operator<, LT)
/** \returns an expression of the coefficient-wise \>= operator of *this and \a other
*
@@ -108,7 +146,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator>,std::greater)
*
* \sa all(), any(), operator>(), operator<=()
*/
-EIGEN_MAKE_CWISE_BINARY_OP(operator>=,std::greater_equal)
+EIGEN_MAKE_CWISE_COMP_R_OP(operator>=, operator<=, LE)
/** \returns an expression of the coefficient-wise == operator of *this and \a other
*
@@ -122,7 +160,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator>=,std::greater_equal)
*
* \sa all(), any(), isApprox(), isMuchSmallerThan()
*/
-EIGEN_MAKE_CWISE_BINARY_OP(operator==,std::equal_to)
+EIGEN_MAKE_CWISE_COMP_OP(operator==, EQ)
/** \returns an expression of the coefficient-wise != operator of *this and \a other
*
@@ -136,7 +174,11 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator==,std::equal_to)
*
* \sa all(), any(), isApprox(), isMuchSmallerThan()
*/
-EIGEN_MAKE_CWISE_BINARY_OP(operator!=,std::not_equal_to)
+EIGEN_MAKE_CWISE_COMP_OP(operator!=, NEQ)
+
+
+#undef EIGEN_MAKE_CWISE_COMP_OP
+#undef EIGEN_MAKE_CWISE_COMP_R_OP
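Usage stays as before (a sketch, not part of the patch): the new operators return bool-valued expressions usable with all(), any(), count() and select(), and the friend overloads keep scalar-on-the-left comparisons working:

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::ArrayXd a(4);
      a << 1, 2, 3, 4;
      std::cout << (a < 3).count() << "\n";       // 2
      std::cout << (2 <= a).all() << "\n";        // 0, since a(0) == 1
      std::cout << (a > 2).select(a, 0) << "\n";  // zeroes out entries <= 2
      return 0;
    }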
// scalar addition
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 9843c2a58..c9f7c8f6e 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -422,26 +422,3 @@ operator!() const
THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
return BooleanNotReturnType(derived());
}
-
-
-#define EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(METHOD_NAME,FUNCTOR) \
- EIGEN_DEVICE_FUNC \
- inline const CwiseUnaryOp<std::binder2nd<FUNCTOR<Scalar> >, const Derived> \
- METHOD_NAME(const Scalar& s) const { \
- return CwiseUnaryOp<std::binder2nd<FUNCTOR<Scalar> >, const Derived> \
- (derived(), std::bind2nd(FUNCTOR<Scalar>(), s)); \
- } \
- friend inline const CwiseUnaryOp<std::binder1st<FUNCTOR<Scalar> >, const Derived> \
- METHOD_NAME(const Scalar& s, const Derived& d) { \
- return CwiseUnaryOp<std::binder1st<FUNCTOR<Scalar> >, const Derived> \
- (d, std::bind1st(FUNCTOR<Scalar>(), s)); \
- }
-
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator==, std::equal_to)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator!=, std::not_equal_to)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator<, std::less)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator<=, std::less_equal)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator>, std::greater)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator>=, std::greater_equal)
-
-
diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
index b9582a5a0..6dd2e1192 100644
--- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h
@@ -132,3 +132,21 @@ cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
{
return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
}
+
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,internal::cmp_EQ>, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType;
+
+/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
+ *
+ * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+ * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+ * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+ * isMuchSmallerThan().
+ *
+ * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
+ */
+EIGEN_DEVICE_FUNC
+inline const CwiseScalarEqualReturnType
+cwiseEqual(const Scalar& s) const
+{
+ return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op<Scalar,internal::cmp_EQ>());
+}
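A usage sketch (not part of the patch); the exact comparison is fine for exactly representable values such as 0 and 1:

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::Matrix3d m = Eigen::Matrix3d::Identity();
      std::cout << m.cwiseEqual(1.0).count() << "\n"; // 3, the diagonal
      return 0;
    }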
diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
index c99ee94ec..e339140bf 100644
--- a/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.h
@@ -8,13 +8,14 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-// This file is a base class plugin containing matrix specifics coefficient wise functions.
+// This file is included into the body of the base classes supporting matrix-specific coefficient-wise functions.
+// These include MatrixBase and SparseMatrixBase.
typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
-typedef CwiseUnaryOp<std::binder1st<std::equal_to<Scalar> >, const Derived> CwiseScalarEqualReturnType;
+
/** \returns an expression of the coefficient-wise absolute value of \c *this
*
* Example: \include MatrixBase_cwiseAbs.cpp
@@ -58,19 +59,3 @@ cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
EIGEN_DEVICE_FUNC
inline const CwiseInverseReturnType
cwiseInverse() const { return CwiseInverseReturnType(derived()); }
-
-/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
- *
- * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
- * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
- * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
- * isMuchSmallerThan().
- *
- * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
- */
-EIGEN_DEVICE_FUNC
-inline const CwiseScalarEqualReturnType
-cwiseEqual(const Scalar& s) const
-{
- return CwiseScalarEqualReturnType(derived(), std::bind1st(std::equal_to<Scalar>(), s));
-}
diff --git a/bench/btl/libs/blaze/blaze_interface.hh b/bench/btl/libs/blaze/blaze_interface.hh
index ed43ecdd4..ee1523944 100644
--- a/bench/btl/libs/blaze/blaze_interface.hh
+++ b/bench/btl/libs/blaze/blaze_interface.hh
@@ -85,15 +85,15 @@ public :
}
static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
- X = A.transpose()*B.transpose();
+ X = (trans(A)*trans(B));
}
static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
- X = (A.transpose()*A);
+ X = (trans(A)*A);
}
static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
- X = (A*A.transpose());
+ X = (A*trans(A));
}
static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
@@ -101,7 +101,7 @@ public :
}
static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
- X = (A.transpose()*B);
+ X = (trans(A)*B);
}
static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N){
diff --git a/bench/btl/libs/blaze/main.cpp b/bench/btl/libs/blaze/main.cpp
index 582a2956b..80e8f4eaa 100644
--- a/bench/btl/libs/blaze/main.cpp
+++ b/bench/btl/libs/blaze/main.cpp
@@ -29,7 +29,7 @@ int main()
bench<Action_axpby<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
bench<Action_matrix_vector_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
-// bench<Action_atv_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+ bench<Action_atv_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
// bench<Action_matrix_matrix_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
// bench<Action_ata_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
// bench<Action_aat_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
diff --git a/bench/eig33.cpp b/bench/eig33.cpp
index 1608b999d..47947a9be 100644
--- a/bench/eig33.cpp
+++ b/bench/eig33.cpp
@@ -50,7 +50,7 @@ inline void computeRoots(const Matrix& m, Roots& roots)
{
typedef typename Matrix::Scalar Scalar;
const Scalar s_inv3 = 1.0/3.0;
- const Scalar s_sqrt3 = internal::sqrt(Scalar(3.0));
+ const Scalar s_sqrt3 = std::sqrt(Scalar(3.0));
// The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The
// eigenvalues are the roots to this equation, all guaranteed to be
@@ -73,23 +73,13 @@ inline void computeRoots(const Matrix& m, Roots& roots)
q = Scalar(0);
// Compute the eigenvalues by solving for the roots of the polynomial.
- Scalar rho = internal::sqrt(-a_over_3);
- Scalar theta = std::atan2(internal::sqrt(-q),half_b)*s_inv3;
- Scalar cos_theta = internal::cos(theta);
- Scalar sin_theta = internal::sin(theta);
- roots(0) = c2_over_3 + Scalar(2)*rho*cos_theta;
- roots(1) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta);
- roots(2) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta);
-
- // Sort in increasing order.
- if (roots(0) >= roots(1))
- std::swap(roots(0),roots(1));
- if (roots(1) >= roots(2))
- {
- std::swap(roots(1),roots(2));
- if (roots(0) >= roots(1))
- std::swap(roots(0),roots(1));
- }
+ Scalar rho = std::sqrt(-a_over_3);
+ Scalar theta = std::atan2(std::sqrt(-q),half_b)*s_inv3;
+ Scalar cos_theta = std::cos(theta);
+ Scalar sin_theta = std::sin(theta);
+ roots(2) = c2_over_3 + Scalar(2)*rho*cos_theta;
+ roots(0) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta);
+ roots(1) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta);
}
template<typename Matrix, typename Vector>
@@ -99,9 +89,12 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals)
// Scale the matrix so its entries are in [-1,1]. The scaling is applied
// only when at least one matrix entry has magnitude larger than 1.
- Scalar scale = mat.cwiseAbs()/*.template triangularView<Lower>()*/.maxCoeff();
+ Scalar shift = mat.trace()/3;
+ Matrix scaledMat = mat;
+ scaledMat.diagonal().array() -= shift;
+ Scalar scale = scaledMat.cwiseAbs()/*.template triangularView<Lower>()*/.maxCoeff();
scale = std::max(scale,Scalar(1));
- Matrix scaledMat = mat / scale;
+ scaledMat/=scale;
// Compute the eigenvalues
// scaledMat.setZero();
@@ -166,6 +159,7 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals)
// Rescale back to the original size.
evals *= scale;
+ evals.array()+=shift;
}
int main()
@@ -173,24 +167,29 @@ int main()
BenchTimer t;
int tries = 10;
int rep = 400000;
- typedef Matrix3f Mat;
- typedef Vector3f Vec;
+ typedef Matrix3d Mat;
+ typedef Vector3d Vec;
Mat A = Mat::Random(3,3);
A = A.adjoint() * A;
+// Mat Q = A.householderQr().householderQ();
+// A = Q * Vec(2.2424567,2.2424566,7.454353).asDiagonal() * Q.transpose();
SelfAdjointEigenSolver<Mat> eig(A);
BENCH(t, tries, rep, eig.compute(A));
- std::cout << "Eigen: " << t.best() << "s\n";
+ std::cout << "Eigen iterative: " << t.best() << "s\n";
+
+ BENCH(t, tries, rep, eig.computeDirect(A));
+ std::cout << "Eigen direct : " << t.best() << "s\n";
Mat evecs;
Vec evals;
BENCH(t, tries, rep, eigen33(A,evecs,evals));
std::cout << "Direct: " << t.best() << "s\n\n";
- std::cerr << "Eigenvalue/eigenvector diffs:\n";
- std::cerr << (evals - eig.eigenvalues()).transpose() << "\n";
- for(int k=0;k<3;++k)
- if(evecs.col(k).dot(eig.eigenvectors().col(k))<0)
- evecs.col(k) = -evecs.col(k);
- std::cerr << evecs - eig.eigenvectors() << "\n\n";
+// std::cerr << "Eigenvalue/eigenvector diffs:\n";
+// std::cerr << (evals - eig.eigenvalues()).transpose() << "\n";
+// for(int k=0;k<3;++k)
+// if(evecs.col(k).dot(eig.eigenvectors().col(k))<0)
+// evecs.col(k) = -evecs.col(k);
+// std::cerr << evecs - eig.eigenvectors() << "\n\n";
}
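The benchmark now conditions the input by removing the mean eigenvalue before scaling; a standalone sketch of that idea, assuming a self-adjoint 3x3 input:
\code
#include <Eigen/Dense>
#include <algorithm>

// Shift-and-scale conditioning: subtract trace/3 from the diagonal,
// scale the entries into [-1,1], solve, then undo both transforms.
void conditionedEigenvalues(const Eigen::Matrix3d& A, Eigen::Vector3d& evals)
{
  double shift = A.trace()/3.0;
  Eigen::Matrix3d B = A;
  B.diagonal().array() -= shift;                        // center the spectrum
  double scale = std::max(B.cwiseAbs().maxCoeff(), 1.0);
  B /= scale;                                           // entries now in [-1,1]
  Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> eig(B);
  evals = eig.eigenvalues()*scale;                      // undo the scaling
  evals.array() += shift;                               // undo the shift
}
\endcode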
diff --git a/blas/level2_impl.h b/blas/level2_impl.h
index e604fe611..917f2e372 100644
--- a/blas/level2_impl.h
+++ b/blas/level2_impl.h
@@ -351,8 +351,8 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real
static bool init = false;
if(!init)
{
- for(int k=0; k<16; ++k)
- func[k] = 0;
+ for(int i=0; i<16; ++i)
+ func[i] = 0;
func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,ColMajor>::run);
func[TR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,RowMajor>::run);
diff --git a/blas/level3_impl.h b/blas/level3_impl.h
index 37a803ced..b3b727675 100644
--- a/blas/level3_impl.h
+++ b/blas/level3_impl.h
@@ -18,8 +18,8 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
static bool init = false;
if(!init)
{
- for(int k=0; k<12; ++k)
- func[k] = 0;
+ for(int i=0; i<12; ++i)
+ func[i] = 0;
func[NOTR | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,ColMajor,false,ColMajor>::run);
func[TR | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,false,ColMajor>::run);
func[ADJ | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor>::run);
@@ -72,8 +72,8 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
static bool init = false;
if(!init)
{
- for(int k=0; k<32; ++k)
- func[k] = 0;
+ for(int i=0; i<32; ++i)
+ func[i] = 0;
func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,ColMajor,ColMajor>::run);
func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,RowMajor,ColMajor>::run);
@@ -312,8 +312,8 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
static bool init = false;
if(!init)
{
- for(int k=0; k<8; ++k)
- func[k] = 0;
+ for(int i=0; i<8; ++i)
+ func[i] = 0;
func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Upper>::run);
func[TR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Upper>::run);
@@ -506,8 +506,8 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
static bool init = false;
if(!init)
{
- for(int k=0; k<8; ++k)
- func[k] = 0;
+ for(int i=0; i<8; ++i)
+ func[i] = 0;
func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Upper>::run);
func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Upper>::run);
diff --git a/doc/A05_PortingFrom2To3.dox b/doc/A05_PortingFrom2To3.dox
index 47011aec0..2d9182bbb 100644
--- a/doc/A05_PortingFrom2To3.dox
+++ b/doc/A05_PortingFrom2To3.dox
@@ -278,7 +278,7 @@ result = Vector4f::MapAligned(some_aligned_array);
\section StdContainers STL Containers
-In Eigen2, <tt>#include<Eigen/StdVector></tt> tweaked std::vector to automatically align elements. The problem was that that was quite invasive. In Eigen3, we only override standard behavior if you use Eigen::aligned_allocator<T> as your allocator type. So for example, if you use std::vector<Matrix4f>, you need to do the following change (note that aligned_allocator is under namespace Eigen):
+In Eigen2, <tt>\#include\<Eigen/StdVector\></tt> tweaked std::vector to automatically align elements. The problem was that this was quite invasive. In Eigen3, we only override standard behavior if you use Eigen::aligned_allocator<T> as your allocator type. So for example, if you use std::vector<Matrix4f>, you need to do the following change (note that aligned_allocator is under namespace Eigen):
<table class="manual">
<tr><th>Eigen 2</th><th>Eigen 3</th></tr>
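The Eigen 3 column of that table boils down to passing the aligned allocator explicitly; a minimal compilable sketch:
\code
#include <Eigen/StdVector>
#include <Eigen/Dense>
#include <vector>

int main()
{
  // Eigen 3: std::vector itself is untouched, only the allocator changes.
  std::vector<Eigen::Matrix4f, Eigen::aligned_allocator<Eigen::Matrix4f> > v(10);
  v[0].setIdentity();
}
\endcode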
diff --git a/doc/B01_Experimental.dox b/doc/B01_Experimental.dox
index 5fc0ccd60..e1f031db8 100644
--- a/doc/B01_Experimental.dox
+++ b/doc/B01_Experimental.dox
@@ -4,7 +4,7 @@ namespace Eigen {
\eigenAutoToc
-\section summary Summary
+\section Experimental_summary Summary
With the 2.0 release, Eigen's API is, to a large extent, stable. However, we wish to retain the freedom to make API incompatible changes. To that effect, we call many parts of Eigen "experimental" which means that they are not subject to API stability guarantee.
@@ -17,7 +17,7 @@ Experimental features may at any time:
\li be subject to an API incompatible change;
\li introduce API or ABI incompatible changes in your own code if you let them affect your API or ABI.
-\section modules Experimental modules
+\section Experimental_modules Experimental modules
The following modules are considered entirely experimental, and we make no firm API stability guarantee about them for the time being:
\li SVD
@@ -26,7 +26,7 @@ The following modules are considered entirely experimental, and we make no firm
\li Sparse
\li Geometry (this one should be mostly stable, but it's a little too early to make a formal guarantee)
-\section core Experimental parts of the Core module
+\section Experimental_core Experimental parts of the Core module
In the Core module, the only class subject to the ABI stability guarantee (meaning that you can use it for data members in your public ABI) is:
\li Matrix
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 5d82add72..800bb30ee 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -206,7 +206,7 @@ TAB_SIZE = 8
# You can put \n's in the value part of an alias to insert newlines.
ALIASES = "only_for_vectors=This is only for vectors (either row-vectors or column-vectors), i.e. matrices which are known at compile-time to have either one row or one column." \
- "not_reentrant=\warning This function is not re-entrant." \
+ "not_reentrant=\warning This function is not re-entrant." \
"array_module=This is defined in the %Array module. \code #include <Eigen/Array> \endcode" \
"cholesky_module=This is defined in the %Cholesky module. \code #include <Eigen/Cholesky> \endcode" \
"eigenvalues_module=This is defined in the %Eigenvalues module. \code #include <Eigen/Eigenvalues> \endcode" \
@@ -317,7 +317,7 @@ IDL_PROPERTY_SUPPORT = YES
# member in the group (if any) for the other members of the group. By default
# all members of a group must be documented explicitly.
-DISTRIBUTE_GROUP_DOC = NO
+DISTRIBUTE_GROUP_DOC = YES
# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
# the same type (for instance a group of public functions) to be put as a
@@ -367,7 +367,7 @@ TYPEDEF_HIDES_STRUCT = NO
# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
# corresponding to a cache size of 2^16 = 65536 symbols.
-SYMBOL_CACHE_SIZE = 0
+# SYMBOL_CACHE_SIZE = 0
# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
@@ -564,7 +564,7 @@ GENERATE_BUGLIST = ${EIGEN_DOXY_INTERNAL}
# disable (NO) the deprecated list. This list is created by putting
# \deprecated commands in the documentation.
-GENERATE_DEPRECATEDLIST= ${EIGEN_DOXY_INTERNAL}
+GENERATE_DEPRECATEDLIST= YES
# The ENABLED_SECTIONS tag can be used to enable conditional
# documentation sections, marked by \if sectionname ... \endif.
@@ -1467,13 +1467,13 @@ XML_OUTPUT = xml
# which can be used by a validating XML parser to check the
# syntax of the XML files.
-XML_SCHEMA =
+# XML_SCHEMA =
# The XML_DTD tag can be used to specify an XML DTD,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
-XML_DTD =
+# XML_DTD =
# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
# dump the program listings (including syntax highlighting
@@ -1700,7 +1700,7 @@ DOT_NUM_THREADS = 0
# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
# directory containing the font.
-DOT_FONTNAME = FreeSans
+DOT_FONTNAME =
# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
# The default size is 10pt.
diff --git a/doc/FixedSizeVectorizable.dox b/doc/FixedSizeVectorizable.dox
index 8ae135173..49e38af76 100644
--- a/doc/FixedSizeVectorizable.dox
+++ b/doc/FixedSizeVectorizable.dox
@@ -4,7 +4,7 @@ namespace Eigen {
The goal of this page is to explain what we mean by "fixed-size vectorizable".
-\section summary Executive Summary
+\section FixedSizeVectorizable_summary Executive Summary
An Eigen object is called "fixed-size vectorizable" if it has fixed size and that size is a multiple of 16 bytes.
@@ -21,7 +21,7 @@ Examples include:
\li Eigen::Quaterniond
\li Eigen::Quaternionf
-\section explanation Explanation
+\section FixedSizeVectorizable_explanation Explanation
First, "fixed-size" should be clear: an Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example Matrix3f has fixed size, but MatrixXf doesn't (the opposite of fixed-size is dynamic-size).
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index 4be2167ef..6ffa2fc26 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -87,6 +87,9 @@ run time. However, these assertions do cost time and can thus be turned off.
- \b EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
+ - \b EIGEN_HAS_POSIX_MEMALIGN - defines whether aligned memory allocation can be performed through the \c posix_memalign
+ function. The availability of \c posix_memalign is automatically checked on most platforms, but this option allows one to
+ bypass %Eigen's built-in rules.
\section TopicPreprocessorDirectivesPlugins Plugins
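A minimal sketch of overriding the detection, assuming the macro is defined before any Eigen header is included:
\code
// Force-disable posix_memalign, bypassing the automatic platform check.
#define EIGEN_HAS_POSIX_MEMALIGN 0
#include <Eigen/Dense>

int main()
{
  // Heap allocations now go through Eigen's fallback aligned allocator.
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(64, 64);
  return m.size() == 64*64 ? 0 : 1;
}
\endcode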
diff --git a/doc/StlContainers.dox b/doc/StlContainers.dox
index d8d0d529c..e0f8714a9 100644
--- a/doc/StlContainers.dox
+++ b/doc/StlContainers.dox
@@ -4,7 +4,7 @@ namespace Eigen {
\eigenAutoToc
-\section summary Executive summary
+\section StlContainers_summary Executive summary
Using STL containers on \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", or classes having members of such types, requires taking the following two steps:
@@ -28,7 +28,7 @@ std::map<int, Eigen::Vector4f, std::less<int>,
\endcode
Note that the third parameter "std::less<int>" is just the default value, but we have to include it because we want to specify the fourth parameter, which is the allocator type.
-\section vector The case of std::vector
+\section StlContainers_vector The case of std::vector
The situation with std::vector was even worse (explanation below) so we had to specialize it for the Eigen::aligned_allocator type. In practice you \b must use the Eigen::aligned_allocator (not another aligned allocator), \b and \#include <Eigen/StdVector>.
diff --git a/doc/StructHavingEigenMembers.dox b/doc/StructHavingEigenMembers.dox
index 74a8d5217..bd4fa7599 100644
--- a/doc/StructHavingEigenMembers.dox
+++ b/doc/StructHavingEigenMembers.dox
@@ -4,11 +4,11 @@ namespace Eigen {
\eigenAutoToc
-\section summary Executive Summary
+\section StructHavingEigenMembers_summary Executive Summary
If you define a structure having members of \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen types", you must overload its "operator new" so that it generates 16-bytes-aligned pointers. Fortunately, Eigen provides you with a macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW that does that for you.
-\section what What kind of code needs to be changed?
+\section StructHavingEigenMembers_what What kind of code needs to be changed?
The kind of code that needs to be changed is this:
@@ -27,7 +27,7 @@ Foo *foo = new Foo;
In other words: you have a class that has as a member a \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen object", and then you dynamically create an object of that class.
-\section how How should such code be modified?
+\section StructHavingEigenMembers_how How should such code be modified?
Very easy, you just need to put a EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro in a public part of your class, like this:
@@ -50,7 +50,7 @@ This macro makes "new Foo" always return an aligned pointer.
If this approach is too intrusive, see also the \ref othersolutions.
-\section why Why is this needed?
+\section StructHavingEigenMembers_why Why is this needed?
OK let's say that your code looks like this:
@@ -81,7 +81,7 @@ The alignment attribute of the member v is then relative to the start of the cla
The solution is to let class Foo have an aligned "operator new", as we showed in the previous section.
-\section movetotop Should I then put all the members of Eigen types at the beginning of my class?
+\section StructHavingEigenMembers_movetotop Should I then put all the members of Eigen types at the beginning of my class?
That's not required. Since Eigen takes care of declaring 128-bit alignment, all members that need it are automatically 128-bit aligned relatively to the class. So code like this works fine:
@@ -95,15 +95,15 @@ public:
};
\endcode
-\section dynamicsize What about dynamic-size matrices and vectors?
+\section StructHavingEigenMembers_dynamicsize What about dynamic-size matrices and vectors?
Dynamic-size matrices and vectors, such as Eigen::VectorXd, allocate dynamically their own array of coefficients, so they take care of requiring absolute alignment automatically. So they don't cause this issue. The issue discussed here is only with \ref TopicFixedSizeVectorizable "fixed-size vectorizable matrices and vectors".
-\section bugineigen So is this a bug in Eigen?
+\section StructHavingEigenMembers_bugineigen So is this a bug in Eigen?
No, it's not our bug. It's more like an inherent problem of the C++98 language specification, and seems to be taken care of in the upcoming language revision: <a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2341.pdf">see this document</a>.
-\section conditional What if I want to do this conditionnally (depending on template parameters) ?
+\section StructHavingEigenMembers_conditional What if I want to do this conditionally (depending on template parameters)?
For this situation, we offer the macro EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign). It will generate aligned operators like EIGEN_MAKE_ALIGNED_OPERATOR_NEW if NeedsToAlign is true. It will generate operators with the default alignment if NeedsToAlign is false.
@@ -128,7 +128,7 @@ Foo<3> *foo3 = new Foo<3>; // foo3 has only the system default alignment guarant
\endcode
-\section othersolutions Other solutions
+\section StructHavingEigenMembers_othersolutions Other solutions
In case putting the EIGEN_MAKE_ALIGNED_OPERATOR_NEW macro everywhere is too intrusive, there exist at least two other solutions.
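A minimal sketch of the recommended macro-based pattern (the class name is hypothetical):
\code
#include <Eigen/Dense>

class Camera
{
  Eigen::Vector4f position;        // fixed-size vectorizable member
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW  // aligned operator new/delete for this class
};

int main()
{
  Camera *c = new Camera;          // pointer is now guaranteed to be 16-byte aligned
  delete c;
}
\endcode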
diff --git a/doc/TemplateKeyword.dox b/doc/TemplateKeyword.dox
index c9944ae05..e06aba7ba 100644
--- a/doc/TemplateKeyword.dox
+++ b/doc/TemplateKeyword.dox
@@ -85,11 +85,11 @@ The precise rules are rather complicated, but ignoring some subtleties we can su
- A <em>dependent name</em> is a name that depends (directly or indirectly) on a template parameter. In the
example, \c dst is a dependent name because it is of type <tt>MatrixBase&lt;Derived1&gt;</tt> which depends
on the template parameter \c Derived1.
-- If the code contains either one of the contructions <tt>xxx.yyy</tt> or <tt>xxx-&gt;yyy</tt> and \c xxx is a
+- If the code contains either one of the constructs <tt>xxx.yyy</tt> or <tt>xxx-&gt;yyy</tt> and \c xxx is a
dependent name and \c yyy refers to a member template, then the \c template keyword must be used before
\c yyy, leading to <tt>xxx.template yyy</tt> or <tt>xxx-&gt;template yyy</tt>.
-- If the code contains the contruction <tt>xxx::yyy</tt> and \c xxx is a dependent name and \c yyy refers to a
- member typedef, then the \c typename keyword must be used before the whole construction, leading to
+- If the code contains the construct <tt>xxx::yyy</tt> and \c xxx is a dependent name and \c yyy refers to a
+ member typedef, then the \c typename keyword must be used before the whole construct, leading to
<tt>typename xxx::yyy</tt>.
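Both rules in one compilable sketch (the helper names are hypothetical):
\code
#include <Eigen/Dense>

template<typename Derived1, typename Derived2>
void copyUpperPart(Eigen::MatrixBase<Derived1>& dst, const Eigen::MatrixBase<Derived2>& src)
{
  // dst is a dependent name, so the member template needs the 'template' keyword:
  dst.template triangularView<Eigen::Upper>() = src;
}

template<typename MatrixType>
typename MatrixType::Scalar firstCoeff(const MatrixType& m)
{
  // MatrixType::Scalar is a dependent member typedef, hence 'typename':
  return m(0,0);
}

int main()
{
  Eigen::Matrix3d a = Eigen::Matrix3d::Zero(), b = Eigen::Matrix3d::Ones();
  copyUpperPart(a, b);
  return firstCoeff(a) == 1.0 ? 0 : 1;
}
\endcode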
As an example where the \c typename keyword is required, consider the following code in \ref TutorialSparse
diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox
index ba5e26290..ba3547143 100644
--- a/doc/TopicMultithreading.dox
+++ b/doc/TopicMultithreading.dox
@@ -17,7 +17,7 @@ You can control the number of thread that will be used using either the OpenMP A
Unless setNbThreads has been called, Eigen uses the number of threads specified by OpenMP. You can restore this behavior by calling \code setNbThreads(0); \endcode
You can query the number of threads that will be used with:
\code
-n = Eigen::nbThreads(n);
+n = Eigen::nbThreads( );
\endcode
You can disable Eigen's multithreading at compile time by defining the EIGEN_DONT_PARALLELIZE preprocessor token.
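A minimal sketch of the set/query/restore cycle described above:
\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::setNbThreads(4);      // request 4 threads for Eigen's parallel kernels
  int n = Eigen::nbThreads();  // query the effective thread count
  std::cout << "Eigen will use " << n << " threads\n";
  Eigen::setNbThreads(0);      // restore the OpenMP-controlled default
}
\endcode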
diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox
index e9df5de04..f8e755b79 100644
--- a/doc/UsingNVCC.dox
+++ b/doc/UsingNVCC.dox
@@ -15,7 +15,7 @@ Known issues:
- \c nvcc with \c clang does not work (patch welcome)
- - \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c <limits> header file. To workaround this, you can add the following before including any other files:
+ - \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file. To work around this, you can add the following before including any other files:
\code
// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
diff --git a/doc/snippets/MatrixBase_marked.cpp b/doc/snippets/MatrixBase_marked.cpp
deleted file mode 100644
index f60712178..000000000
--- a/doc/snippets/MatrixBase_marked.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _MSC_VER
- #warning deprecated
-#endif
-/*
-Matrix3d m = Matrix3d::Zero();
-m.part<Eigen::UpperTriangular>().setOnes();
-cout << "Here is the matrix m:" << endl << m << endl;
-Matrix3d n = Matrix3d::Ones();
-n.part<Eigen::LowerTriangular>() *= 2;
-cout << "Here is the matrix n:" << endl << n << endl;
-cout << "And now here is m.inverse()*n, taking advantage of the fact that"
- " m is upper-triangular:" << endl
- << m.marked<Eigen::UpperTriangular>().solveTriangular(n);
-*/ \ No newline at end of file
diff --git a/doc/snippets/MatrixBase_part.cpp b/doc/snippets/MatrixBase_part.cpp
deleted file mode 100644
index d3e7f482e..000000000
--- a/doc/snippets/MatrixBase_part.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _MSC_VER
- #warning deprecated
-#endif
-/*
-Matrix3d m = Matrix3d::Zero();
-m.part<Eigen::StrictlyUpperTriangular>().setOnes();
-cout << "Here is the matrix m:" << endl << m << endl;
-cout << "And let us now compute m*m.adjoint() in a very optimized way" << endl
- << "taking advantage of the symmetry." << endl;
-Matrix3d n;
-n.part<Eigen::SelfAdjoint>() = (m*m.adjoint()).lazy();
-cout << "The result is:" << endl << n << endl;
-*/ \ No newline at end of file
diff --git a/doc/snippets/MatrixBase_extract.cpp b/doc/snippets/MatrixBase_triangularView.cpp
index c96220f72..03aa303f0 100644
--- a/doc/snippets/MatrixBase_extract.cpp
+++ b/doc/snippets/MatrixBase_triangularView.cpp
@@ -1,13 +1,9 @@
-#ifndef _MSC_VER
- #warning deprecated
-#endif
-/* deprecated
Matrix3i m = Matrix3i::Random();
cout << "Here is the matrix m:" << endl << m << endl;
cout << "Here is the upper-triangular matrix extracted from m:" << endl
- << m.part<Eigen::UpperTriangular>() << endl;
+ << Matrix3i(m.triangularView<Eigen::Upper>()) << endl;
cout << "Here is the strictly-upper-triangular matrix extracted from m:" << endl
- << m.part<Eigen::StrictlyUpperTriangular>() << endl;
+ << Matrix3i(m.triangularView<Eigen::StrictlyUpper>()) << endl;
cout << "Here is the unit-lower-triangular matrix extracted from m:" << endl
- << m.part<Eigen::UnitLowerTriangular>() << endl;
-*/ \ No newline at end of file
+ << Matrix3i(m.triangularView<Eigen::UnitLower>()) << endl;
+// FIXME need to implement output for triangularViews (Bug 885)
diff --git a/doc/snippets/Triangular_solve.cpp b/doc/snippets/Triangular_solve.cpp
new file mode 100644
index 000000000..548442467
--- /dev/null
+++ b/doc/snippets/Triangular_solve.cpp
@@ -0,0 +1,11 @@
+Matrix3d m = Matrix3d::Zero();
+m.triangularView<Eigen::Upper>().setOnes();
+cout << "Here is the matrix m:\n" << m << endl;
+Matrix3d n = Matrix3d::Ones();
+n.triangularView<Eigen::Lower>() *= 2;
+cout << "Here is the matrix n:\n" << n << endl;
+cout << "And now here is m.inverse()*n, taking advantage of the fact that"
+ " m is upper-triangular:\n"
+ << m.triangularView<Eigen::Upper>().solve(n) << endl;
+cout << "And this is n*m.inverse():\n"
+ << m.triangularView<Eigen::Upper>().solve<Eigen::OnTheRight>(n);
diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt
index d3e82ecd9..8df0a7631 100644
--- a/failtest/CMakeLists.txt
+++ b/failtest/CMakeLists.txt
@@ -47,6 +47,8 @@ ei_add_failtest("sparse_ref_3")
ei_add_failtest("sparse_ref_4")
ei_add_failtest("sparse_ref_5")
+ei_add_failtest("sparse_storage_mismatch")
+
ei_add_failtest("partialpivlu_int")
ei_add_failtest("fullpivlu_int")
ei_add_failtest("llt_int")
diff --git a/failtest/sparse_storage_mismatch.cpp b/failtest/sparse_storage_mismatch.cpp
new file mode 100644
index 000000000..51840d416
--- /dev/null
+++ b/failtest/sparse_storage_mismatch.cpp
@@ -0,0 +1,16 @@
+#include "../Eigen/Sparse"
+using namespace Eigen;
+
+typedef SparseMatrix<double,ColMajor> Mat1;
+#ifdef EIGEN_SHOULD_FAIL_TO_BUILD
+typedef SparseMatrix<double,RowMajor> Mat2;
+#else
+typedef SparseMatrix<double,ColMajor> Mat2;
+#endif
+
+int main()
+{
+ Mat1 a(10,10);
+ Mat2 b(10,10);
+ a += b;
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 54ce7fb30..767e82f21 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -181,6 +181,7 @@ ei_add_test(array_for_matrix)
ei_add_test(array_replicate)
ei_add_test(array_reverse)
ei_add_test(ref)
+ei_add_test(is_same_dense)
ei_add_test(triangular)
ei_add_test(selfadjoint)
ei_add_test(product_selfadjoint)
@@ -252,6 +253,7 @@ ei_add_test(vectorwiseop)
ei_add_test(special_numbers)
ei_add_test(rvalue_types)
ei_add_test(dense_storage)
+ei_add_test(ctorleak)
# # ei_add_test(denseLM)
diff --git a/test/array.cpp b/test/array.cpp
index 0208ba7c9..90c75e9f0 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -22,6 +22,8 @@ template<typename ArrayType> void array(const ArrayType& m)
ArrayType m1 = ArrayType::Random(rows, cols),
m2 = ArrayType::Random(rows, cols),
m3(rows, cols);
+ ArrayType m4 = m1; // copy constructor
+ VERIFY_IS_APPROX(m1, m4);
ColVectorType cv1 = ColVectorType::Random(rows);
RowVectorType rv1 = RowVectorType::Random(cols);
@@ -134,6 +136,8 @@ template<typename ArrayType> void comparisons(const ArrayType& m)
VERIFY(! (m1 < m3).all() );
VERIFY(! (m1 > m3).all() );
}
+ VERIFY(!(m1 > m2 && m1 < m2).any());
+ VERIFY((m1 <= m2 || m1 >= m2).all());
// comparisons array to scalar
VERIFY( (m1 != (m1(r,c)+1) ).any() );
diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
index 9a50f99ab..9667e1f14 100644
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -102,6 +102,7 @@ template<typename MatrixType> void comparisons(const MatrixType& m)
VERIFY( (m1.array() > (m1(r,c)-1) ).any() );
VERIFY( (m1.array() < (m1(r,c)+1) ).any() );
VERIFY( (m1.array() == m1(r,c) ).any() );
+ VERIFY( m1.cwiseEqual(m1(r,c)).any() );
// test Select
VERIFY_IS_APPROX( (m1.array()<m2.array()).select(m1,m2), m1.cwiseMin(m2) );
diff --git a/test/array_replicate.cpp b/test/array_replicate.cpp
index f412d1aed..779c8fc2f 100644
--- a/test/array_replicate.cpp
+++ b/test/array_replicate.cpp
@@ -44,6 +44,19 @@ template<typename MatrixType> void replicate(const MatrixType& m)
x2 << m2, m2, m2,
m2, m2, m2;
VERIFY_IS_APPROX(x2, (m2.template replicate<2,3>()));
+
+ x2.resize(rows,3*cols);
+ x2 << m2, m2, m2;
+ VERIFY_IS_APPROX(x2, (m2.template replicate<1,3>()));
+
+ vx1.resize(3*rows,cols);
+ vx1 << m2, m2, m2;
+ VERIFY_IS_APPROX(vx1+vx1, vx1+(m2.template replicate<3,1>()));
+
+ vx1=m2+(m2.colwise().replicate(1));
+
+ if(m2.cols()==1)
+ VERIFY_IS_APPROX(m2.coeff(0), (m2.template replicate<3,1>().coeff(m2.rows())));
x2.resize(rows,f1);
for (int j=0; j<f1; ++j)
diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp
index 7a9a11330..4cc0dd31c 100644
--- a/test/bicgstab.cpp
+++ b/test/bicgstab.cpp
@@ -17,6 +17,9 @@ template<typename T, typename I> void test_bicgstab_T()
BiCGSTAB<SparseMatrix<T,0,I>, IncompleteLUT<T,I> > bicgstab_colmajor_ilut;
//BiCGSTAB<SparseMatrix<T>, SSORPreconditioner<T> > bicgstab_colmajor_ssor;
+ bicgstab_colmajor_diag.setTolerance(NumTraits<T>::epsilon()*4);
+ bicgstab_colmajor_ilut.setTolerance(NumTraits<T>::epsilon()*4);
+
CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_diag) );
// CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_I) );
CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_ilut) );
diff --git a/test/ctorleak.cpp b/test/ctorleak.cpp
index 145d91be4..c158f5e4e 100644
--- a/test/ctorleak.cpp
+++ b/test/ctorleak.cpp
@@ -4,48 +4,66 @@
struct Foo
{
- static unsigned object_count;
- static unsigned object_limit;
+ static Index object_count;
+ static Index object_limit;
int dummy;
Foo()
{
#ifdef EIGEN_EXCEPTIONS
// TODO: Is this the correct way to handle this?
- if (Foo::object_count > Foo::object_limit) { throw Foo::Fail(); }
+ if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); }
#endif
+ std::cout << '+';
++Foo::object_count;
}
~Foo()
{
+ std::cout << '-';
--Foo::object_count;
}
class Fail : public std::exception {};
};
-unsigned Foo::object_count = 0;
-unsigned Foo::object_limit = 0;
+Index Foo::object_count = 0;
+Index Foo::object_limit = 0;
+#undef EIGEN_TEST_MAX_SIZE
+#define EIGEN_TEST_MAX_SIZE 3
void test_ctorleak()
{
- typedef DenseIndex Index;
+ typedef Matrix<Foo, Dynamic, Dynamic> MatrixX;
+ typedef Matrix<Foo, Dynamic, 1> VectorX;
Foo::object_count = 0;
for(int i = 0; i < g_repeat; i++) {
Index rows = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE), cols = internal::random<Index>(2,EIGEN_TEST_MAX_SIZE);
- Foo::object_limit = internal::random(0, rows*cols - 2);
+ Foo::object_limit = internal::random<Index>(0, rows*cols - 2);
+ std::cout << "object_limit =" << Foo::object_limit << std::endl;
#ifdef EIGEN_EXCEPTIONS
try
{
#endif
- Matrix<Foo, Dynamic, Dynamic> m(rows, cols);
+ std::cout << "\nMatrixX m(" << rows << ", " << cols << ");\n";
+ MatrixX m(rows, cols);
#ifdef EIGEN_EXCEPTIONS
VERIFY(false); // not reached if exceptions are enabled
}
catch (const Foo::Fail&) { /* ignore */ }
#endif
+ VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+
+ {
+ Foo::object_limit = (rows+1)*(cols+1);
+ MatrixX A(rows, cols);
+ VERIFY_IS_EQUAL(Foo::object_count, rows*cols);
+ VectorX v=A.row(0);
+ VERIFY_IS_EQUAL(Foo::object_count, (rows+1)*cols);
+ v = A.col(0);
+ VERIFY_IS_EQUAL(Foo::object_count, rows*(cols+1));
+ }
+ VERIFY_IS_EQUAL(Index(0), Foo::object_count);
}
- VERIFY_IS_EQUAL(static_cast<unsigned>(0), Foo::object_count);
}
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index 7b0077a6d..41b6d99ab 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -9,9 +9,45 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
+#include "svd_fill.h"
#include <limits>
#include <Eigen/Eigenvalues>
+
+template<typename MatrixType> void selfadjointeigensolver_essential_check(const MatrixType& m)
+{
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ RealScalar eival_eps = (std::min)(test_precision<RealScalar>(), NumTraits<Scalar>::dummy_precision()*20000);
+
+ SelfAdjointEigenSolver<MatrixType> eiSymm(m);
+ VERIFY_IS_EQUAL(eiSymm.info(), Success);
+ VERIFY_IS_APPROX(m.template selfadjointView<Lower>() * eiSymm.eigenvectors(),
+ eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal());
+ VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
+ VERIFY_IS_UNITARY(eiSymm.eigenvectors());
+
+ if(m.cols()<=4)
+ {
+ SelfAdjointEigenSolver<MatrixType> eiDirect;
+ eiDirect.computeDirect(m);
+ VERIFY_IS_EQUAL(eiDirect.info(), Success);
+ VERIFY_IS_APPROX(eiSymm.eigenvalues(), eiDirect.eigenvalues());
+ if(! eiSymm.eigenvalues().isApprox(eiDirect.eigenvalues(), eival_eps) )
+ {
+ std::cerr << "reference eigenvalues: " << eiSymm.eigenvalues().transpose() << "\n"
+ << "obtained eigenvalues: " << eiDirect.eigenvalues().transpose() << "\n"
+ << "diff: " << (eiSymm.eigenvalues()-eiDirect.eigenvalues()).transpose() << "\n"
+ << "error (eps): " << (eiSymm.eigenvalues()-eiDirect.eigenvalues()).norm() / eiSymm.eigenvalues().norm() << " (" << eival_eps << ")\n";
+ }
+ VERIFY(eiSymm.eigenvalues().isApprox(eiDirect.eigenvalues(), eival_eps));
+ VERIFY_IS_APPROX(m.template selfadjointView<Lower>() * eiDirect.eigenvectors(),
+ eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal());
+ VERIFY_IS_APPROX(m.template selfadjointView<Lower>().eigenvalues(), eiDirect.eigenvalues());
+ VERIFY_IS_UNITARY(eiDirect.eigenvectors());
+ }
+}
+
template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
{
typedef typename MatrixType::Index Index;
@@ -31,17 +67,8 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
MatrixType symmA = a.adjoint() * a + a1.adjoint() * a1;
MatrixType symmC = symmA;
- // randomly nullify some rows/columns
- {
- Index count = 1;//internal::random<Index>(-cols,cols);
- for(Index k=0; k<count; ++k)
- {
- Index i = internal::random<Index>(0,cols-1);
- symmA.row(i).setZero();
- symmA.col(i).setZero();
- }
- }
-
+ svd_fill_random(symmA,Symmetric);
+
symmA.template triangularView<StrictlyUpper>().setZero();
symmC.template triangularView<StrictlyUpper>().setZero();
@@ -49,23 +76,13 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
MatrixType b1 = MatrixType::Random(rows,cols);
MatrixType symmB = b.adjoint() * b + b1.adjoint() * b1;
symmB.template triangularView<StrictlyUpper>().setZero();
+
+ CALL_SUBTEST( selfadjointeigensolver_essential_check(symmA) );
SelfAdjointEigenSolver<MatrixType> eiSymm(symmA);
- SelfAdjointEigenSolver<MatrixType> eiDirect;
- eiDirect.computeDirect(symmA);
// generalized eigen pb
GeneralizedSelfAdjointEigenSolver<MatrixType> eiSymmGen(symmC, symmB);
- VERIFY_IS_EQUAL(eiSymm.info(), Success);
- VERIFY((symmA.template selfadjointView<Lower>() * eiSymm.eigenvectors()).isApprox(
- eiSymm.eigenvectors() * eiSymm.eigenvalues().asDiagonal(), largerEps));
- VERIFY_IS_APPROX(symmA.template selfadjointView<Lower>().eigenvalues(), eiSymm.eigenvalues());
-
- VERIFY_IS_EQUAL(eiDirect.info(), Success);
- VERIFY((symmA.template selfadjointView<Lower>() * eiDirect.eigenvectors()).isApprox(
- eiDirect.eigenvectors() * eiDirect.eigenvalues().asDiagonal(), largerEps));
- VERIFY_IS_APPROX(symmA.template selfadjointView<Lower>().eigenvalues(), eiDirect.eigenvalues());
-
SelfAdjointEigenSolver<MatrixType> eiSymmNoEivecs(symmA, false);
VERIFY_IS_EQUAL(eiSymmNoEivecs.info(), Success);
VERIFY_IS_APPROX(eiSymm.eigenvalues(), eiSymmNoEivecs.eigenvalues());
@@ -141,6 +158,24 @@ template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
}
}
+void bug_854()
+{
+ Matrix3d m;
+ m << 850.961, 51.966, 0,
+ 51.966, 254.841, 0,
+ 0, 0, 0;
+ selfadjointeigensolver_essential_check(m);
+}
+
+void bug_1014()
+{
+ Matrix3d m;
+ m << 0.11111111111111114658, 0, 0,
+ 0, 0.11111111111111109107, 0,
+ 0, 0, 0.11111111111111107719;
+ selfadjointeigensolver_essential_check(m);
+}
+
void test_eigensolver_selfadjoint()
{
int s = 0;
@@ -168,6 +203,9 @@ void test_eigensolver_selfadjoint()
CALL_SUBTEST_6( selfadjointeigensolver(Matrix<double,1,1>()) );
CALL_SUBTEST_7( selfadjointeigensolver(Matrix<double,2,2>()) );
}
+
+ CALL_SUBTEST_13( bug_854() );
+ CALL_SUBTEST_13( bug_1014() );
// Test problem size constructors
s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp
index 2f9d18c0f..bf63c69ec 100644
--- a/test/geo_homogeneous.cpp
+++ b/test/geo_homogeneous.cpp
@@ -94,6 +94,21 @@ template<typename Scalar,int Size> void homogeneous(void)
VERIFY_IS_APPROX((aff * pts2).colwise().hnormalized(), aff * pts2.colwise().hnormalized());
VERIFY_IS_APPROX((caff * pts2).colwise().hnormalized(), caff * pts2.colwise().hnormalized());
VERIFY_IS_APPROX((proj * pts2).colwise().hnormalized(), (proj * pts2.colwise().hnormalized().colwise().homogeneous()).colwise().hnormalized());
+
+ // Test combination of homogeneous
+
+ VERIFY_IS_APPROX( (t2 * v0.homogeneous()).hnormalized(),
+ (t2.template topLeftCorner<Size,Size>() * v0 + t2.template topRightCorner<Size,1>())
+ / ((t2.template bottomLeftCorner<1,Size>()*v0).value() + t2(Size,Size)) );
+
+ VERIFY_IS_APPROX( (t2 * pts.colwise().homogeneous()).colwise().hnormalized(),
+ (Matrix<Scalar, Size+1, Dynamic>(t2 * pts1).colwise().hnormalized()) );
+
+ VERIFY_IS_APPROX( (t2 .lazyProduct( v0.homogeneous() )).hnormalized(), (t2 * v0.homogeneous()).hnormalized() );
+ VERIFY_IS_APPROX( (t2 .lazyProduct ( pts.colwise().homogeneous() )).colwise().hnormalized(), (t2 * pts1).colwise().hnormalized() );
+
+ VERIFY_IS_APPROX( (v0.transpose().homogeneous() .lazyProduct( t2 )).hnormalized(), (v0.transpose().homogeneous()*t2).hnormalized() );
+ VERIFY_IS_APPROX( (pts.transpose().rowwise().homogeneous() .lazyProduct( t2 )).rowwise().hnormalized(), (pts1.transpose()*t2).rowwise().hnormalized() );
}
void test_geo_homogeneous()
diff --git a/test/inverse.cpp b/test/inverse.cpp
index b09989aca..5c6777a18 100644
--- a/test/inverse.cpp
+++ b/test/inverse.cpp
@@ -71,11 +71,11 @@ template<typename MatrixType> void inverse(const MatrixType& m)
// check with submatrices
{
- Matrix<Scalar, MatrixType::RowsAtCompileTime+1, MatrixType::RowsAtCompileTime+1, MatrixType::Options> m3;
- m3.setRandom();
- m3.topLeftCorner(rows,rows) = m1;
- m2 = m3.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>().inverse();
- VERIFY_IS_APPROX( (m3.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>()), m2.inverse() );
+ Matrix<Scalar, MatrixType::RowsAtCompileTime+1, MatrixType::RowsAtCompileTime+1, MatrixType::Options> m5;
+ m5.setRandom();
+ m5.topLeftCorner(rows,rows) = m1;
+ m2 = m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>().inverse();
+ VERIFY_IS_APPROX( (m5.template topLeftCorner<MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime>()), m2.inverse() );
}
#endif
diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp
new file mode 100644
index 000000000..318ba8717
--- /dev/null
+++ b/test/is_same_dense.cpp
@@ -0,0 +1,30 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+void test_is_same_dense()
+{
+ MatrixXd m1(10,10);
+ Ref<MatrixXd> ref_m1(m1);
+ Ref<const MatrixXd> const_ref_m1(m1);
+ VERIFY(is_same_dense(m1,m1));
+ VERIFY(is_same_dense(m1,ref_m1));
+ VERIFY(is_same_dense(const_ref_m1,m1));
+ VERIFY(is_same_dense(const_ref_m1,ref_m1));
+
+ VERIFY(is_same_dense(m1.block(0,0,m1.rows(),m1.cols()),m1));
+ VERIFY(!is_same_dense(m1.row(0),m1.col(0)));
+
+ Ref<const MatrixXd> const_ref_m1_row(m1.row(1));
+ VERIFY(!is_same_dense(m1.row(1),const_ref_m1_row));
+
+ Ref<const MatrixXd> const_ref_m1_col(m1.col(1));
+ VERIFY(is_same_dense(m1.col(1),const_ref_m1_col));
+}
diff --git a/test/lu.cpp b/test/lu.cpp
index 25f86755a..b90367438 100644
--- a/test/lu.cpp
+++ b/test/lu.cpp
@@ -100,7 +100,9 @@ template<typename MatrixType> void lu_invertible()
LU.h
*/
typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
- int size = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+ Index size = MatrixType::RowsAtCompileTime;
+ if( size==Dynamic)
+ size = internal::random<Index>(1,EIGEN_TEST_MAX_SIZE);
MatrixType m1(size, size), m2(size, size), m3(size, size);
FullPivLU<MatrixType> lu;
@@ -122,6 +124,10 @@ template<typename MatrixType> void lu_invertible()
m2 = lu.solve(m3);
VERIFY_IS_APPROX(m3, m1*m2);
VERIFY_IS_APPROX(m2, lu.inverse()*m3);
+
+ // Regression test for Bug 302
+ MatrixType m4 = MatrixType::Random(size,size);
+ VERIFY_IS_APPROX(lu.solve(m3*m4), lu.solve(m3)*m4);
}
template<typename MatrixType> void lu_partial_piv()
@@ -171,6 +177,7 @@ void test_lu()
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( lu_non_invertible<Matrix3f>() );
+ CALL_SUBTEST_1( lu_invertible<Matrix3f>() );
CALL_SUBTEST_1( lu_verify_assert<Matrix3f>() );
CALL_SUBTEST_2( (lu_non_invertible<Matrix<double, 4, 6> >()) );
diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp
index 5eba3ecb3..05a86e70b 100644
--- a/test/mapped_matrix.cpp
+++ b/test/mapped_matrix.cpp
@@ -22,7 +22,6 @@ template<typename VectorType> void map_class_vector(const VectorType& m)
Index size = m.size();
- // test Map.h
Scalar* array1 = internal::aligned_new<Scalar>(size);
Scalar* array2 = internal::aligned_new<Scalar>(size);
Scalar* array3 = new Scalar[size+1];
@@ -56,23 +55,64 @@ template<typename MatrixType> void map_class_matrix(const MatrixType& m)
typedef typename MatrixType::Scalar Scalar;
Index rows = m.rows(), cols = m.cols(), size = rows*cols;
+ Scalar s1 = internal::random<Scalar>();
- // test Map.h
+ // array1 and array2 -> aligned heap allocation
Scalar* array1 = internal::aligned_new<Scalar>(size);
for(int i = 0; i < size; i++) array1[i] = Scalar(1);
Scalar* array2 = internal::aligned_new<Scalar>(size);
for(int i = 0; i < size; i++) array2[i] = Scalar(1);
+ // array3unaligned -> unaligned pointer to heap
Scalar* array3 = new Scalar[size+1];
for(int i = 0; i < size+1; i++) array3[i] = Scalar(1);
Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3;
- Map<MatrixType, Aligned>(array1, rows, cols) = MatrixType::Ones(rows,cols);
- Map<MatrixType>(array2, rows, cols) = Map<MatrixType>(array1, rows, cols);
- Map<MatrixType>(array3unaligned, rows, cols) = Map<MatrixType>(array1, rows, cols);
- MatrixType ma1 = Map<MatrixType>(array1, rows, cols);
- MatrixType ma2 = Map<MatrixType, Aligned>(array2, rows, cols);
+ Scalar array4[256];
+ if(size<=256)
+ for(int i = 0; i < size; i++) array4[i] = Scalar(1);
+
+ Map<MatrixType> map1(array1, rows, cols);
+ Map<MatrixType, Aligned> map2(array2, rows, cols);
+ Map<MatrixType> map3(array3unaligned, rows, cols);
+ Map<MatrixType> map4(array4, rows, cols);
+
+ VERIFY_IS_EQUAL(map1, MatrixType::Ones(rows,cols));
+ VERIFY_IS_EQUAL(map2, MatrixType::Ones(rows,cols));
+ VERIFY_IS_EQUAL(map3, MatrixType::Ones(rows,cols));
+ map1 = MatrixType::Random(rows,cols);
+ map2 = map1;
+ map3 = map1;
+ MatrixType ma1 = map1;
+ MatrixType ma2 = map2;
+ MatrixType ma3 = map3;
+ VERIFY_IS_EQUAL(map1, map2);
+ VERIFY_IS_EQUAL(map1, map3);
VERIFY_IS_EQUAL(ma1, ma2);
- MatrixType ma3 = Map<MatrixType>(array3unaligned, rows, cols);
VERIFY_IS_EQUAL(ma1, ma3);
+ VERIFY_IS_EQUAL(ma1, map3);
+
+ VERIFY_IS_APPROX(s1*map1, s1*map2);
+ VERIFY_IS_APPROX(s1*ma1, s1*ma2);
+ VERIFY_IS_EQUAL(s1*ma1, s1*ma3);
+ VERIFY_IS_APPROX(s1*map1, s1*map3);
+
+ map2 *= s1;
+ map3 *= s1;
+ VERIFY_IS_APPROX(s1*map1, map2);
+ VERIFY_IS_APPROX(s1*map1, map3);
+
+ if(size<=256)
+ {
+ VERIFY_IS_EQUAL(map4, MatrixType::Ones(rows,cols));
+ map4 = map1;
+ MatrixType ma4 = map4;
+ VERIFY_IS_EQUAL(map1, map4);
+ VERIFY_IS_EQUAL(ma1, map4);
+ VERIFY_IS_EQUAL(ma1, ma4);
+ VERIFY_IS_APPROX(s1*map1, s1*map4);
+
+ map4 *= s1;
+ VERIFY_IS_APPROX(s1*map1, map4);
+ }
internal::aligned_delete(array1, size);
internal::aligned_delete(array2, size);
@@ -86,7 +126,6 @@ template<typename VectorType> void map_static_methods(const VectorType& m)
Index size = m.size();
- // test Map.h
Scalar* array1 = internal::aligned_new<Scalar>(size);
Scalar* array2 = internal::aligned_new<Scalar>(size);
Scalar* array3 = new Scalar[size+1];
diff --git a/test/mapstride.cpp b/test/mapstride.cpp
index b1dc9de2a..13523bb56 100644
--- a/test/mapstride.cpp
+++ b/test/mapstride.cpp
@@ -56,16 +56,30 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
Index rows = _m.rows(), cols = _m.cols();
MatrixType m = MatrixType::Random(rows,cols);
+ Scalar s1 = internal::random<Scalar>();
Index arraysize = 2*(rows+4)*(cols+4);
- Scalar* a_array = internal::aligned_new<Scalar>(arraysize+1);
- Scalar* array = a_array;
+ Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
+ Scalar* array1 = a_array1;
if(Alignment!=Aligned)
- array = (Scalar*)(ptrdiff_t(a_array) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+ array1 = (Scalar*)(std::ptrdiff_t(a_array1) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+ Scalar a_array2[256];
+ Scalar* array2 = a_array2;
+ if(Alignment!=Aligned)
+ array2 = (Scalar*)(std::ptrdiff_t(a_array2) + (internal::packet_traits<Scalar>::AlignedOnScalar?sizeof(Scalar):sizeof(typename NumTraits<Scalar>::Real)));
+ else
+ array2 = (Scalar*)(((std::size_t(a_array2)+EIGEN_ALIGN_BYTES-1)/EIGEN_ALIGN_BYTES)*EIGEN_ALIGN_BYTES);
+ Index maxsize2 = a_array2 - array2 + 256;
+
// test no inner stride and some dynamic outer stride
+ for(int k=0; k<2; ++k)
{
+ if(k==1 && (m.innerSize()+1)*m.outerSize() > maxsize2)
+ break;
+ Scalar* array = (k==0 ? array1 : array2);
+
Map<MatrixType, Alignment, OuterStride<Dynamic> > map(array, rows, cols, OuterStride<Dynamic>(m.innerSize()+1));
map = m;
VERIFY(map.outerStride() == map.innerSize()+1);
@@ -75,11 +89,19 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j));
VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
}
+ VERIFY_IS_APPROX(s1*map,s1*m);
+ map *= s1;
+ VERIFY_IS_APPROX(map,s1*m);
}
// test no inner stride and an outer stride of +4. This is quite important as for fixed-size matrices,
// this allows to hit the special case where it's vectorizable.
+ for(int k=0; k<2; ++k)
{
+ if(k==1 && (m.innerSize()+4)*m.outerSize() > maxsize2)
+ break;
+ Scalar* array = (k==0 ? array1 : array2);
+
enum {
InnerSize = MatrixType::InnerSizeAtCompileTime,
OuterStrideAtCompileTime = InnerSize==Dynamic ? Dynamic : InnerSize+4
@@ -94,10 +116,18 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j));
VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
}
+ VERIFY_IS_APPROX(s1*map,s1*m);
+ map *= s1;
+ VERIFY_IS_APPROX(map,s1*m);
}
// test both inner stride and outer stride
+ for(int k=0; k<2; ++k)
{
+ if(k==1 && (2*m.innerSize()+1)*(m.outerSize()*2) > maxsize2)
+ break;
+ Scalar* array = (k==0 ? array1 : array2);
+
Map<MatrixType, Alignment, Stride<Dynamic,Dynamic> > map(array, rows, cols, Stride<Dynamic,Dynamic>(2*m.innerSize()+1, 2));
map = m;
VERIFY(map.outerStride() == 2*map.innerSize()+1);
@@ -108,9 +138,12 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
VERIFY(array[map.outerStride()*i+map.innerStride()*j] == m.coeffByOuterInner(i,j));
VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
}
+ VERIFY_IS_APPROX(s1*map,s1*m);
+ map *= s1;
+ VERIFY_IS_APPROX(map,s1*m);
}
- internal::aligned_delete(a_array, arraysize+1);
+ internal::aligned_delete(a_array1, arraysize+1);
}
void test_mapstride()
diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp
index 7b0dbc763..7fd4c5ebb 100644
--- a/test/permutationmatrices.cpp
+++ b/test/permutationmatrices.cpp
@@ -102,6 +102,30 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
}
}
+template<typename T>
+void bug890()
+{
+ typedef Matrix<T, Dynamic, Dynamic> MatrixType;
+ typedef Matrix<T, Dynamic, 1> VectorType;
+ typedef Stride<Dynamic,Dynamic> S;
+ typedef Map<MatrixType, Aligned, S> MapType;
+ typedef PermutationMatrix<Dynamic> Perm;
+
+ VectorType v1(2), v2(2), op(4), rhs(2);
+ v1 << 666,667;
+ op << 1,0,0,1;
+ rhs << 42,42;
+
+ Perm P(2);
+ P.indices() << 1, 0;
+
+ MapType(v1.data(),2,1,S(1,1)) = P * MapType(rhs.data(),2,1,S(1,1));
+ VERIFY_IS_APPROX(v1, (P * rhs).eval());
+
+ MapType(v1.data(),2,1,S(1,1)) = P.inverse() * MapType(rhs.data(),2,1,S(1,1));
+ VERIFY_IS_APPROX(v1, (P.inverse() * rhs).eval());
+}
+
void test_permutationmatrices()
{
for(int i = 0; i < g_repeat; i++) {
@@ -113,4 +137,5 @@ void test_permutationmatrices()
CALL_SUBTEST_6( permutationmatrices(Matrix<double,Dynamic,Dynamic,RowMajor>(20, 30)) );
CALL_SUBTEST_7( permutationmatrices(MatrixXcf(15, 10)) );
}
+ CALL_SUBTEST_5( bug890<double>() );
}
diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp
index b7cc2d351..649c817b4 100644
--- a/test/simplicial_cholesky.cpp
+++ b/test/simplicial_cholesky.cpp
@@ -35,8 +35,8 @@ template<typename T, typename I> void test_simplicial_cholesky_T()
check_sparse_spd_determinant(ldlt_colmajor_lower_amd);
check_sparse_spd_determinant(ldlt_colmajor_upper_amd);
- check_sparse_spd_solving(ldlt_colmajor_lower_nat);
- check_sparse_spd_solving(ldlt_colmajor_upper_nat);
+ check_sparse_spd_solving(ldlt_colmajor_lower_nat, 300, 1000);
+ check_sparse_spd_solving(ldlt_colmajor_upper_nat, 300, 1000);
}
void test_simplicial_cholesky()
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index 3bad3def7..f1e5b8e4c 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -88,6 +88,10 @@ template<typename SparseMatrixType> void sparse_product()
VERIFY_IS_APPROX(dm4=m2*refMat3, refMat4=refMat2*refMat3);
VERIFY_IS_APPROX(dm4=dm4+m2*refMat3, refMat4=refMat4+refMat2*refMat3);
+ VERIFY_IS_APPROX(dm4+=m2*refMat3, refMat4+=refMat2*refMat3);
+ VERIFY_IS_APPROX(dm4-=m2*refMat3, refMat4-=refMat2*refMat3);
+ VERIFY_IS_APPROX(dm4.noalias()+=m2*refMat3, refMat4+=refMat2*refMat3);
+ VERIFY_IS_APPROX(dm4.noalias()-=m2*refMat3, refMat4-=refMat2*refMat3);
VERIFY_IS_APPROX(dm4=m2*(refMat3+refMat3), refMat4=refMat2*(refMat3+refMat3));
VERIFY_IS_APPROX(dm4=m2t.transpose()*(refMat3+refMat5)*0.5, refMat4=refMat2t.transpose()*(refMat3+refMat5)*0.5);
@@ -101,6 +105,9 @@ template<typename SparseMatrixType> void sparse_product()
VERIFY_IS_APPROX(dm4=refMat2*m3, refMat4=refMat2*refMat3);
VERIFY_IS_APPROX(dm4=dm4+refMat2*m3, refMat4=refMat4+refMat2*refMat3);
VERIFY_IS_APPROX(dm4+=refMat2*m3, refMat4+=refMat2*refMat3);
+ VERIFY_IS_APPROX(dm4-=refMat2*m3, refMat4-=refMat2*refMat3);
+ VERIFY_IS_APPROX(dm4.noalias()+=refMat2*m3, refMat4+=refMat2*refMat3);
+ VERIFY_IS_APPROX(dm4.noalias()-=refMat2*m3, refMat4-=refMat2*refMat3);
VERIFY_IS_APPROX(dm4=refMat2*m3t.transpose(), refMat4=refMat2*refMat3t.transpose());
VERIFY_IS_APPROX(dm4=refMat2t.transpose()*m3, refMat4=refMat2t.transpose()*refMat3);
VERIFY_IS_APPROX(dm4=refMat2t.transpose()*m3t.transpose(), refMat4=refMat2t.transpose()*refMat3t.transpose());
@@ -278,11 +285,35 @@ template<typename SparseMatrixType, typename DenseMatrixType> void sparse_produc
VERIFY_IS_APPROX( m4(0,0), 0.0 );
}
+template<typename Scalar>
+void bug_942()
+{
+ typedef Matrix<Scalar, Dynamic, 1> Vector;
+ typedef SparseMatrix<Scalar, ColMajor> ColSpMat;
+ typedef SparseMatrix<Scalar, RowMajor> RowSpMat;
+ ColSpMat cmA(1,1);
+ cmA.insert(0,0) = 1;
+
+ RowSpMat rmA(1,1);
+ rmA.insert(0,0) = 1;
+
+ Vector d(1);
+ d[0] = 2;
+
+ double res = 2;
+
+ VERIFY_IS_APPROX( ( cmA*d.asDiagonal() ).eval().coeff(0,0), res );
+ VERIFY_IS_APPROX( ( d.asDiagonal()*rmA ).eval().coeff(0,0), res );
+ VERIFY_IS_APPROX( ( rmA*d.asDiagonal() ).eval().coeff(0,0), res );
+ VERIFY_IS_APPROX( ( d.asDiagonal()*cmA ).eval().coeff(0,0), res );
+}
+
void test_sparse_product()
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( (sparse_product<SparseMatrix<double,ColMajor> >()) );
CALL_SUBTEST_1( (sparse_product<SparseMatrix<double,RowMajor> >()) );
+ CALL_SUBTEST_1( (bug_942<double>()) );
CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, ColMajor > >()) );
CALL_SUBTEST_2( (sparse_product<SparseMatrix<std::complex<double>, RowMajor > >()) );
CALL_SUBTEST_3( (sparse_product<SparseMatrix<float,ColMajor,long int> >()) );
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
index e7380ba21..d173ee658 100644
--- a/test/sparse_ref.cpp
+++ b/test/sparse_ref.cpp
@@ -47,12 +47,20 @@ EIGEN_DONT_INLINE void call_ref_1(Ref<SparseMatrix<float> > a, const B &b) { VER
template<typename B>
EIGEN_DONT_INLINE void call_ref_2(const Ref<const SparseMatrix<float> >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_3(const Ref<const SparseMatrix<float>, StandardCompressedFormat>& a, const B &b) {
+ VERIFY(a.isCompressed());
+ VERIFY_IS_EQUAL(a.toDense(),b.toDense());
+}
+
void call_ref()
{
// SparseVector<std::complex<float> > ca = VectorXcf::Random(10).sparseView();
// SparseVector<float> a = VectorXf::Random(10).sparseView();
SparseMatrix<float> A = MatrixXf::Random(10,10).sparseView(0.5,1);
SparseMatrix<float,RowMajor> B = MatrixXf::Random(10,10).sparseView(0.5,1);
+ SparseMatrix<float> C = MatrixXf::Random(10,10).sparseView(0.5,1);
+ C.reserve(VectorXi::Constant(C.outerSize(), 2));
const SparseMatrix<float>& Ac(A);
Block<SparseMatrix<float> > Ab(A,0,1, 3,3);
const Block<SparseMatrix<float> > Abc(A,0,1,3,3);
@@ -61,12 +69,22 @@ void call_ref()
VERIFY_EVALUATION_COUNT( call_ref_1(A, A), 0);
// VERIFY_EVALUATION_COUNT( call_ref_1(Ac, Ac), 0); // does not compile on purpose
VERIFY_EVALUATION_COUNT( call_ref_2(A, A), 0);
+ VERIFY_EVALUATION_COUNT( call_ref_3(A, A), 0);
VERIFY_EVALUATION_COUNT( call_ref_2(A.transpose(), A.transpose()), 1);
+ VERIFY_EVALUATION_COUNT( call_ref_3(A.transpose(), A.transpose()), 1);
VERIFY_EVALUATION_COUNT( call_ref_2(Ac,Ac), 0);
+ VERIFY_EVALUATION_COUNT( call_ref_3(Ac,Ac), 0);
VERIFY_EVALUATION_COUNT( call_ref_2(A+A,2*Ac), 1);
+ VERIFY_EVALUATION_COUNT( call_ref_3(A+A,2*Ac), 1);
VERIFY_EVALUATION_COUNT( call_ref_2(B, B), 1);
+ VERIFY_EVALUATION_COUNT( call_ref_3(B, B), 1);
VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()), 0);
+ VERIFY_EVALUATION_COUNT( call_ref_3(B.transpose(), B.transpose()), 0);
VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1);
+ VERIFY_EVALUATION_COUNT( call_ref_3(A*A, A*A), 1);
+
+ VERIFY(!C.isCompressed());
+ VERIFY_EVALUATION_COUNT( call_ref_3(C, C), 1);
Ref<SparseMatrix<float> > Ar(A);
VERIFY_IS_APPROX(Ar+Ar, A+A);
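Editor's note: the call_ref_3 evaluation counts above hinge on the new StandardCompressedFormat option: a compressed argument binds without a copy, while an uncompressed one (like C) costs exactly one evaluation. A minimal sketch, assuming the SparseRef support introduced in this changeset:

    #include <Eigen/Sparse>
    using namespace Eigen;

    // The Ref guarantees compressed storage inside the callee.
    float first_value(const Ref<const SparseMatrix<float>, StandardCompressedFormat>& m)
    {
      return m.valuePtr()[0];
    }

    int main()
    {
      SparseMatrix<float> A(3, 3);
      A.insert(0, 0) = 1.f;
      A.makeCompressed();                    // compressed: bound with no copy
      first_value(A);

      SparseMatrix<float> C(3, 3);
      C.reserve(VectorXi::Constant(3, 2));
      C.insert(0, 0) = 2.f;                  // uncompressed: Ref builds a compressed copy
      first_value(C);
      return 0;
    }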
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index a078851c3..1d3d25b53 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -9,6 +9,7 @@
#include "sparse.h"
#include <Eigen/SparseCore>
+#include <sstream>
template<typename Solver, typename Rhs, typename DenseMat, typename DenseRhs>
void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const DenseMat& dA, const DenseRhs& db)
@@ -25,14 +26,13 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
solver.compute(A);
if (solver.info() != Success)
{
- std::cerr << "sparse solver testing: factorization failed (check_sparse_solving)\n";
- exit(0);
- return;
+ std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+ VERIFY(solver.info() == Success);
}
x = solver.solve(b);
if (solver.info() != Success)
{
- std::cerr << "sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
+ std::cerr << "WARNING | sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
return;
}
VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
@@ -42,43 +42,23 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
// test the analyze/factorize API
solver.analyzePattern(A);
solver.factorize(A);
- if (solver.info() != Success)
- {
- std::cerr << "sparse solver testing: factorization failed (check_sparse_solving)\n";
- exit(0);
- return;
- }
+ VERIFY(solver.info() == Success && "factorization failed when using analyzePattern/factorize API");
x = solver.solve(b);
- if (solver.info() != Success)
- {
- std::cerr << "sparse solver testing: solving failed\n";
- return;
- }
+ VERIFY(solver.info() == Success && "solving failed when using analyzePattern/factorize API");
VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
VERIFY(x.isApprox(refX,test_precision<Scalar>()));
-
x.setZero();
// test with Map
MappedSparseMatrix<Scalar,Mat::Options,StorageIndex> Am(A.rows(), A.cols(), A.nonZeros(), const_cast<StorageIndex*>(A.outerIndexPtr()), const_cast<StorageIndex*>(A.innerIndexPtr()), const_cast<Scalar*>(A.valuePtr()));
solver.compute(Am);
- if (solver.info() != Success)
- {
- std::cerr << "sparse solver testing: factorization failed (check_sparse_solving)\n";
- exit(0);
- return;
- }
+ VERIFY(solver.info() == Success && "factorization failed when using Map");
DenseRhs dx(refX);
dx.setZero();
Map<DenseRhs> xm(dx.data(), dx.rows(), dx.cols());
Map<const DenseRhs> bm(db.data(), db.rows(), db.cols());
xm = solver.solve(bm);
- if (solver.info() != Success)
- {
- std::cerr << "sparse solver testing: solving with a Map failed\n";
- exit(0);
- return;
- }
+ VERIFY(solver.info() == Success && "solving failed when using Map");
VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!");
VERIFY(xm.isApprox(refX,test_precision<Scalar>()));
}
@@ -113,41 +93,35 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
}
template<typename Solver, typename Rhs>
-void check_sparse_solving_real_cases(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const Rhs& refX)
+void check_sparse_solving_real_cases(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const typename Solver::MatrixType& fullA, const Rhs& refX)
{
typedef typename Solver::MatrixType Mat;
typedef typename Mat::Scalar Scalar;
typedef typename Mat::RealScalar RealScalar;
Rhs x(A.cols(), b.cols());
-
+
solver.compute(A);
if (solver.info() != Success)
{
- std::cerr << "sparse solver testing: factorization failed (check_sparse_solving_real_cases)\n";
- exit(0);
- return;
+ std::cerr << "ERROR | sparse solver testing, factorization failed (" << typeid(Solver).name() << ")\n";
+ VERIFY(solver.info() == Success);
}
x = solver.solve(b);
+
if (solver.info() != Success)
{
- std::cerr << "sparse solver testing: solving failed\n";
+ std::cerr << "WARNING | sparse solver testing, solving failed (" << typeid(Solver).name() << ")\n";
return;
}
- RealScalar res_error;
- // Compute the norm of the relative error
- if(refX.size() != 0)
- res_error = (refX - x).norm()/refX.norm();
- else
- {
- // Compute the relative residual norm
- res_error = (b - A * x).norm()/b.norm();
- }
- if (res_error > test_precision<Scalar>() ){
- std::cerr << "Test " << g_test_stack.back() << " failed in " EI_PP_MAKE_STRING(__FILE__)
- << " (" << EI_PP_MAKE_STRING(__LINE__) << ")" << std::endl << std::endl;
- abort();
+ RealScalar res_error = (fullA*x-b).norm()/b.norm();
+ VERIFY( (res_error <= test_precision<Scalar>() ) && "sparse solver failed without noticing it");
+
+
+ if(refX.size() != 0 && (refX - x).norm()/refX.norm() > test_precision<Scalar>())
+ {
+ std::cerr << "WARNING | found solution is different from the provided reference one\n";
}
}
@@ -160,7 +134,7 @@ void check_sparse_determinant(Solver& solver, const typename Solver::MatrixType&
solver.compute(A);
if (solver.info() != Success)
{
- std::cerr << "sparse solver testing: factorization failed (check_sparse_determinant)\n";
+ std::cerr << "WARNING | sparse solver testing: factorization failed (check_sparse_determinant)\n";
return;
}
@@ -177,7 +151,7 @@ void check_sparse_abs_determinant(Solver& solver, const typename Solver::MatrixT
solver.compute(A);
if (solver.info() != Success)
{
- std::cerr << "sparse solver testing: factorization failed (check_sparse_abs_determinant)\n";
+ std::cerr << "WARNING | sparse solver testing: factorization failed (check_sparse_abs_determinant)\n";
return;
}
@@ -224,13 +198,32 @@ inline std::string get_matrixfolder()
mat_folder = mat_folder + static_cast<std::string>("/real/");
return mat_folder;
}
+std::string sym_to_string(int sym)
+{
+ if(sym==Symmetric) return "Symmetric ";
+ if(sym==SPD) return "SPD ";
+ return "";
+}
+template<typename Derived>
+std::string solver_stats(const IterativeSolverBase<Derived> &solver)
+{
+ std::stringstream ss;
+ ss << solver.iterations() << " iters, error: " << solver.error();
+ return ss.str();
+}
+template<typename Derived>
+std::string solver_stats(const SparseSolverBase<Derived> &/*solver*/)
+{
+ return "";
+}
#endif
-template<typename Solver> void check_sparse_spd_solving(Solver& solver)
+template<typename Solver> void check_sparse_spd_solving(Solver& solver, int maxSize = 300, int maxRealWorldSize = 100000)
{
typedef typename Solver::MatrixType Mat;
typedef typename Mat::Scalar Scalar;
- typedef SparseMatrix<Scalar,ColMajor, typename Mat::StorageIndex> SpMat;
+ typedef typename Mat::StorageIndex StorageIndex;
+ typedef SparseMatrix<Scalar,ColMajor, StorageIndex> SpMat;
typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
typedef Matrix<Scalar,Dynamic,1> DenseVector;
@@ -238,7 +231,7 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver)
Mat A, halfA;
DenseMatrix dA;
for (int i = 0; i < g_repeat; i++) {
- int size = generate_sparse_spd_problem(solver, A, halfA, dA);
+ int size = generate_sparse_spd_problem(solver, A, halfA, dA, maxSize);
// generate the right hand sides
int rhsCols = internal::random<int>(1,16);
@@ -248,12 +241,12 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver)
DenseMatrix dB(size,rhsCols);
initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
- check_sparse_solving(solver, A, b, dA, b);
- check_sparse_solving(solver, halfA, b, dA, b);
- check_sparse_solving(solver, A, dB, dA, dB);
- check_sparse_solving(solver, halfA, dB, dA, dB);
- check_sparse_solving(solver, A, B, dA, dB);
- check_sparse_solving(solver, halfA, B, dA, dB);
+ CALL_SUBTEST( check_sparse_solving(solver, A, b, dA, b) );
+ CALL_SUBTEST( check_sparse_solving(solver, halfA, b, dA, b) );
+ CALL_SUBTEST( check_sparse_solving(solver, A, dB, dA, dB) );
+ CALL_SUBTEST( check_sparse_solving(solver, halfA, dB, dA, dB) );
+ CALL_SUBTEST( check_sparse_solving(solver, A, B, dA, dB) );
+ CALL_SUBTEST( check_sparse_solving(solver, halfA, B, dA, dB) );
// check only once
if(i==0)
@@ -264,25 +257,43 @@ template<typename Solver> void check_sparse_spd_solving(Solver& solver)
}
// First, get the folder
-#ifdef TEST_REAL_CASES
- if (internal::is_same<Scalar, float>::value
- || internal::is_same<Scalar, std::complex<float> >::value)
- return ;
-
- std::string mat_folder = get_matrixfolder<Scalar>();
- MatrixMarketIterator<Scalar> it(mat_folder);
- for (; it; ++it)
+#ifdef TEST_REAL_CASES
+ // Test real problems with double precision only
+ if (internal::is_same<typename NumTraits<Scalar>::Real, double>::value)
{
- if (it.sym() == SPD){
- Mat halfA;
- PermutationMatrix<Dynamic, Dynamic, Index> pnull;
- halfA.template selfadjointView<Solver::UpLo>() = it.matrix().template triangularView<Eigen::Lower>().twistedBy(pnull);
-
- std::cout<< " ==== SOLVING WITH MATRIX " << it.matname() << " ==== \n";
- check_sparse_solving_real_cases(solver, it.matrix(), it.rhs(), it.refX());
- check_sparse_solving_real_cases(solver, halfA, it.rhs(), it.refX());
+ std::string mat_folder = get_matrixfolder<Scalar>();
+ MatrixMarketIterator<Scalar> it(mat_folder);
+ for (; it; ++it)
+ {
+ if (it.sym() == SPD){
+ A = it.matrix();
+ if(A.diagonal().size() <= maxRealWorldSize)
+ {
+ DenseVector b = it.rhs();
+ DenseVector refX = it.refX();
+ PermutationMatrix<Dynamic, Dynamic, StorageIndex> pnull;
+ if(Solver::UpLo == (Lower|Upper))
+ halfA = A;
+ else
+ halfA.template selfadjointView<Solver::UpLo>() = A.template triangularView<Eigen::Lower>().twistedBy(pnull);
+
+ std::cout << "INFO | Testing " << sym_to_string(it.sym()) << "sparse problem " << it.matname()
+ << " (" << A.rows() << "x" << A.cols() << ") using " << typeid(Solver).name() << "..." << std::endl;
+ CALL_SUBTEST( check_sparse_solving_real_cases(solver, A, b, A, refX) );
+ std::string stats = solver_stats(solver);
+ if(stats.size()>0)
+ std::cout << "INFO | " << stats << std::endl;
+ CALL_SUBTEST( check_sparse_solving_real_cases(solver, halfA, b, A, refX) );
+ }
+ else
+ {
+ std::cout << "INFO | Skip sparse problem \"" << it.matname() << "\" (too large)" << std::endl;
+ }
+ }
}
}
+#else
+ EIGEN_UNUSED_VARIABLE(maxRealWorldSize);
#endif
}
@@ -320,7 +331,7 @@ Index generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, De
return size;
}
-template<typename Solver> void check_sparse_square_solving(Solver& solver)
+template<typename Solver> void check_sparse_square_solving(Solver& solver, int maxSize = 300, int maxRealWorldSize = 100000)
{
typedef typename Solver::MatrixType Mat;
typedef typename Mat::Scalar Scalar;
@@ -333,7 +344,7 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver)
Mat A;
DenseMatrix dA;
for (int i = 0; i < g_repeat; i++) {
- Index size = generate_sparse_square_problem(solver, A, dA);
+ Index size = generate_sparse_square_problem(solver, A, dA, maxSize);
A.makeCompressed();
DenseVector b = DenseVector::Random(size);
@@ -342,9 +353,9 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver)
double density = (std::max)(8./(size*rhsCols), 0.1);
initSparse<Scalar>(density, dB, B, ForceNonZeroDiag);
B.makeCompressed();
- check_sparse_solving(solver, A, b, dA, b);
- check_sparse_solving(solver, A, dB, dA, dB);
- check_sparse_solving(solver, A, B, dA, dB);
+ CALL_SUBTEST(check_sparse_solving(solver, A, b, dA, b));
+ CALL_SUBTEST(check_sparse_solving(solver, A, dB, dA, dB));
+ CALL_SUBTEST(check_sparse_solving(solver, A, B, dA, dB));
// check only once
if(i==0)
@@ -356,17 +367,33 @@ template<typename Solver> void check_sparse_square_solving(Solver& solver)
// First, get the folder
#ifdef TEST_REAL_CASES
- if (internal::is_same<Scalar, float>::value
- || internal::is_same<Scalar, std::complex<float> >::value)
- return ;
-
- std::string mat_folder = get_matrixfolder<Scalar>();
- MatrixMarketIterator<Scalar> it(mat_folder);
- for (; it; ++it)
+ // Test real problems with double precision only
+ if (internal::is_same<typename NumTraits<Scalar>::Real, double>::value)
{
- std::cout<< " ==== SOLVING WITH MATRIX " << it.matname() << " ==== \n";
- check_sparse_solving_real_cases(solver, it.matrix(), it.rhs(), it.refX());
+ std::string mat_folder = get_matrixfolder<Scalar>();
+ MatrixMarketIterator<Scalar> it(mat_folder);
+ for (; it; ++it)
+ {
+ A = it.matrix();
+ if(A.diagonal().size() <= maxRealWorldSize)
+ {
+ DenseVector b = it.rhs();
+ DenseVector refX = it.refX();
+ std::cout << "INFO | Testing " << sym_to_string(it.sym()) << "sparse problem " << it.matname()
+ << " (" << A.rows() << "x" << A.cols() << ") using " << typeid(Solver).name() << "..." << std::endl;
+ CALL_SUBTEST(check_sparse_solving_real_cases(solver, A, b, A, refX));
+ std::string stats = solver_stats(solver);
+ if(stats.size()>0)
+ std::cout << "INFO | " << stats << std::endl;
+ }
+ else
+ {
+ std::cout << "INFO | SKIP sparse problem \"" << it.matname() << "\" (too large)" << std::endl;
+ }
+ }
}
+#else
+ EIGEN_UNUSED_VARIABLE(maxRealWorldSize);
#endif
}
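Editor's note: the rewritten checks above exercise both solver entry points. A minimal sketch of the two call patterns, using SimplicialLDLT as a stand-in for any sparse solver (the matrix is illustrative):

    #include <Eigen/Sparse>
    #include <Eigen/SparseCholesky>
    using namespace Eigen;

    int main()
    {
      SparseMatrix<double> A(2, 2);
      A.insert(0, 0) = 4.0;
      A.insert(1, 1) = 2.0;
      A.makeCompressed();
      VectorXd b(2);
      b << 1.0, 1.0;

      SimplicialLDLT<SparseMatrix<double> > solver;

      // One-shot API: symbolic analysis and numeric factorization together.
      solver.compute(A);
      if (solver.info() != Success) return 1;
      VectorXd x = solver.solve(b);

      // Two-phase API: reuse the symbolic analysis across refactorizations.
      solver.analyzePattern(A);
      solver.factorize(A);
      if (solver.info() == Success) x = solver.solve(b);
      return 0;
    }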
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index 5dc421976..d3975b99b 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -55,16 +55,16 @@ template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
// test coeffRef with reallocation
{
- SparseVectorType v1(rows);
- DenseVector v2 = DenseVector::Zero(rows);
+ SparseVectorType v4(rows);
+ DenseVector v5 = DenseVector::Zero(rows);
for(int k=0; k<rows; ++k)
{
int i = internal::random<int>(0,rows-1);
Scalar v = internal::random<Scalar>();
- v1.coeffRef(i) += v;
- v2.coeffRef(i) += v;
+ v4.coeffRef(i) += v;
+ v5.coeffRef(i) += v;
}
- VERIFY_IS_APPROX(v1,v2);
+ VERIFY_IS_APPROX(v4,v5);
}
v1.coeffRef(nonzerocoords[0]) = Scalar(5);
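Editor's note: the renamed block above tests coeffRef() under reallocation; the pattern in isolation looks like this (a sketch, not the test itself):

    #include <Eigen/Sparse>
    using namespace Eigen;

    int main()
    {
      SparseVector<double> v(100);   // no entries allocated yet
      v.coeffRef(3) += 1.0;          // inserting a new entry may reallocate storage
      v.coeffRef(3) += 2.0;          // existing entry: in-place accumulation
      return v.coeff(3) == 3.0 ? 0 : 1;
    }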
diff --git a/test/sparselu.cpp b/test/sparselu.cpp
index 37eb069a9..231c857ad 100644
--- a/test/sparselu.cpp
+++ b/test/sparselu.cpp
@@ -42,8 +42,8 @@ template<typename T> void test_sparselu_T()
SparseLU<SparseMatrix<T, ColMajor, long int>, NaturalOrdering<long int> > sparselu_natural;
check_sparse_square_solving(sparselu_colamd);
- check_sparse_square_solving(sparselu_amd);
- check_sparse_square_solving(sparselu_natural);
+ check_sparse_square_solving(sparselu_amd, 300, 2000);
+ check_sparse_square_solving(sparselu_natural, 300, 2000);
check_sparse_square_abs_determinant(sparselu_colamd);
check_sparse_square_abs_determinant(sparselu_amd);
diff --git a/test/svd_common.h b/test/svd_common.h
index b44b79124..b06a8a0f2 100644
--- a/test/svd_common.h
+++ b/test/svd_common.h
@@ -16,6 +16,8 @@
#error a macro SVD_FOR_MIN_NORM(MatrixType) must be defined prior to including svd_common.h
#endif
+#include "svd_fill.h"
+
// Check that the matrix m is properly reconstructed and that the U and V factors are unitary
// The SVD must have already been computed.
template<typename SvdType, typename MatrixType>
@@ -257,65 +259,6 @@ void svd_test_all_computation_options(const MatrixType& m, bool full_only)
}
}
-template<typename MatrixType>
-void svd_fill_random(MatrixType &m)
-{
- typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::RealScalar RealScalar;
- typedef typename MatrixType::Index Index;
- Index diagSize = (std::min)(m.rows(), m.cols());
- RealScalar s = std::numeric_limits<RealScalar>::max_exponent10/4;
- s = internal::random<RealScalar>(1,s);
- Matrix<RealScalar,Dynamic,1> d = Matrix<RealScalar,Dynamic,1>::Random(diagSize);
- for(Index k=0; k<diagSize; ++k)
- d(k) = d(k)*std::pow(RealScalar(10),internal::random<RealScalar>(-s,s));
-
- bool dup = internal::random<int>(0,10) < 3;
- bool unit_uv = internal::random<int>(0,10) < (dup?7:3); // if we duplicate some diagonal entries, then increase the chance to preserve them using unitary U and V factors
-
- // duplicate some singular values
- if(dup)
- {
- Index n = internal::random<Index>(0,d.size()-1);
- for(Index i=0; i<n; ++i)
- d(internal::random<Index>(0,d.size()-1)) = d(internal::random<Index>(0,d.size()-1));
- }
-
- Matrix<Scalar,Dynamic,Dynamic> U(m.rows(),diagSize);
- Matrix<Scalar,Dynamic,Dynamic> VT(diagSize,m.cols());
- if(unit_uv)
- {
- // in very rare cases let's try with a pure diagonal matrix
- if(internal::random<int>(0,10) < 1)
- {
- U.setIdentity();
- VT.setIdentity();
- }
- else
- {
- createRandomPIMatrixOfRank(diagSize,U.rows(), U.cols(), U);
- createRandomPIMatrixOfRank(diagSize,VT.rows(), VT.cols(), VT);
- }
- }
- else
- {
- U.setRandom();
- VT.setRandom();
- }
-
- m = U * d.asDiagonal() * VT;
-
- // (partly) cancel some coeffs
- if(!(dup && unit_uv))
- {
- Matrix<Scalar,Dynamic,1> samples(7);
- samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -1./NumTraits<RealScalar>::highest(), 1./NumTraits<RealScalar>::highest();
- Index n = internal::random<Index>(0,m.size()-1);
- for(Index i=0; i<n; ++i)
- m(internal::random<Index>(0,m.rows()-1), internal::random<Index>(0,m.cols()-1)) = samples(internal::random<Index>(0,6));
- }
-}
-
// work around stupid msvc error when constructing at compile time an expression that involves
// a division by zero, even if the numeric type has floating point
diff --git a/test/svd_fill.h b/test/svd_fill.h
new file mode 100644
index 000000000..7e44b3d05
--- /dev/null
+++ b/test/svd_fill.h
@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+template<typename MatrixType>
+void svd_fill_random(MatrixType &m, int Option = 0)
+{
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef typename MatrixType::Index Index;
+ Index diagSize = (std::min)(m.rows(), m.cols());
+ RealScalar s = std::numeric_limits<RealScalar>::max_exponent10/4;
+ s = internal::random<RealScalar>(1,s);
+ Matrix<RealScalar,Dynamic,1> d = Matrix<RealScalar,Dynamic,1>::Random(diagSize);
+ for(Index k=0; k<diagSize; ++k)
+ d(k) = d(k)*std::pow(RealScalar(10),internal::random<RealScalar>(-s,s));
+
+ bool dup = internal::random<int>(0,10) < 3;
+ bool unit_uv = internal::random<int>(0,10) < (dup?7:3); // if we duplicate some diagonal entries, then increase the chance to preserve them using unitary U and V factors
+
+ // duplicate some singular values
+ if(dup)
+ {
+ Index n = internal::random<Index>(0,d.size()-1);
+ for(Index i=0; i<n; ++i)
+ d(internal::random<Index>(0,d.size()-1)) = d(internal::random<Index>(0,d.size()-1));
+ }
+
+ Matrix<Scalar,Dynamic,Dynamic> U(m.rows(),diagSize);
+ Matrix<Scalar,Dynamic,Dynamic> VT(diagSize,m.cols());
+ if(unit_uv)
+ {
+ // in very rare cases let's try with a pure diagonal matrix
+ if(internal::random<int>(0,10) < 1)
+ {
+ U.setIdentity();
+ VT.setIdentity();
+ }
+ else
+ {
+ createRandomPIMatrixOfRank(diagSize,U.rows(), U.cols(), U);
+ createRandomPIMatrixOfRank(diagSize,VT.rows(), VT.cols(), VT);
+ }
+ }
+ else
+ {
+ U.setRandom();
+ VT.setRandom();
+ }
+
+ Matrix<Scalar,Dynamic,1> samples(7);
+ samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -1./NumTraits<RealScalar>::highest(), 1./NumTraits<RealScalar>::highest();
+
+ if(Option==Symmetric)
+ {
+ m = U * d.asDiagonal() * U.transpose();
+
+ // randomly nullify some rows/columns
+ {
+ Index count = internal::random<Index>(-diagSize,diagSize);
+ for(Index k=0; k<count; ++k)
+ {
+ Index i = internal::random<Index>(0,diagSize-1);
+ m.row(i).setZero();
+ m.col(i).setZero();
+ }
+ if(count<0)
+ // (partly) cancel some coeffs
+ if(!(dup && unit_uv))
+ {
+
+ Index n = internal::random<Index>(0,m.size()-1);
+ for(Index k=0; k<n; ++k)
+ {
+ Index i = internal::random<Index>(0,m.rows()-1);
+ Index j = internal::random<Index>(0,m.cols()-1);
+ m(j,i) = m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
+ }
+ }
+ }
+ }
+ else
+ {
+ m = U * d.asDiagonal() * VT;
+ // (partly) cancel some coeffs
+ if(!(dup && unit_uv))
+ {
+ Index n = internal::random<Index>(0,m.size()-1);
+ for(Index i=0; i<n; ++i)
+ m(internal::random<Index>(0,m.rows()-1), internal::random<Index>(0,m.cols()-1)) = samples(internal::random<Index>(0,samples.size()-1));
+ }
+ }
+}
+
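Editor's note: a sketch of how a test might call the relocated helper, assuming the Eigen test harness (main.h) is already set up; the new Symmetric option builds U * D * U^T and then zeroes some rows/columns:

    // Inside a test function, after #include "svd_fill.h":
    MatrixXd m(20, 20);
    svd_fill_random(m);             // general case: m = U * D * V^T
    svd_fill_random(m, Symmetric);  // symmetric case: m = U * D * U^T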
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index ae6c3fe7e..54d3cc18b 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -34,8 +34,19 @@
#include <random>
#endif
+#ifdef _WIN32
+#include <winbase.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
#ifdef EIGEN_USE_THREADS
-#include <future>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
#endif
#ifdef EIGEN_USE_GPU
@@ -65,6 +76,7 @@
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
@@ -78,6 +90,7 @@
#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
index 9dea2055a..77b92ee9b 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
@@ -277,7 +277,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_l
template <class NList>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) {
return arg_prod<NList>::value;
-};
+}
template<std::size_t n, typename t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 037219f23..4dbdbfb3e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -59,12 +59,12 @@ namespace Eigen {
* \ref TopicStorageOrders
*/
-template<typename Scalar_, std::size_t NumIndices_, int Options_>
-class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
+template<typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_>
+class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
public:
- typedef Tensor<Scalar_, NumIndices_, Options_> Self;
- typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_> > Base;
+ typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
+ typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<Self>::StorageKind StorageKind;
typedef typename internal::traits<Self>::Index Index;
@@ -86,13 +86,13 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
typedef DSizes<Index, NumIndices_> Dimensions;
protected:
- TensorStorage<Scalar, NumIndices, Dynamic, Options> m_storage;
+ TensorStorage<Scalar, Dimensions, Options> m_storage;
public:
// Metadata
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const { return m_storage.dimensions(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
@@ -105,7 +105,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
- inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+ EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -341,7 +341,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
void resize(Index firstDimension, IndexTypes... otherDimensions)
{
// The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
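Editor's note: with the new IndexType_ parameter, callers can pick a narrower index type. A sketch, assuming the parameter defaults to DenseIndex elsewhere in the module and variadic-template support is available:

    #include <unsupported/Eigen/CXX11/Tensor>
    using namespace Eigen;

    int main()
    {
      Tensor<float, 3> t(2, 3, 4);                  // default index type
      Tensor<float, 3, ColMajor, int> s(2, 3, 4);   // 32-bit indices to shrink metadata
      t.setZero();
      s.setZero();
      return 0;
    }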
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 86e72c3a4..c7cfbfce0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -53,8 +53,15 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
template <typename RandomGenerator> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived>
- random() const {
- return nullaryExpr(RandomGenerator());
+ random(const RandomGenerator& gen = RandomGenerator()) const {
+ return nullaryExpr(gen);
+ }
+
+ // Tensor generation
+ template <typename Generator> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorGeneratorOp<Generator, const Derived>
+ generate(const Generator& generator) const {
+ return TensorGeneratorOp<Generator, const Derived>(derived(), generator);
}
// Generic unary operation support.
@@ -78,6 +85,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived>
+ rsqrt() const {
+ return unaryExpr(internal::scalar_rsqrt_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
square() const {
return unaryExpr(internal::scalar_square_op<Scalar>());
@@ -158,9 +171,9 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
template <typename NewType> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived>
+ EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived>
cast() const {
- return unaryExpr(internal::scalar_cast_op<Scalar, NewType>());
+ return TensorConversionOp<NewType, const Derived>(derived());
}
// Generic binary operation support.
@@ -454,8 +467,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
protected:
- template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor;
- template <typename Scalar, int Options> friend class TensorVarDim;
+ template <typename Scalar, std::size_t NumIndices, int Options, typename IndexType> friend class Tensor;
template <typename OtherDerived, int AccessLevel> friend class TensorBase;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
@@ -471,8 +483,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
typedef typename internal::packet_traits<Scalar>::type PacketReturnType;
static const int NumDimensions = DerivedTraits::NumDimensions;
- template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor;
- template <typename Scalar, int Options> friend class TensorVarDim;
+ template <typename Scalar, std::size_t NumIndices, int Options, typename IndexType> friend class Tensor;
template <typename OtherDerived, int AccessLevel> friend class TensorBase;
EIGEN_DEVICE_FUNC
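Editor's note: a sketch of the new TensorBase entry points added above (generator-driven fill, rsqrt, and the conversion-based cast). The generator functor shape is an assumption based on how TensorGeneratorOp is invoked here:

    #include <unsupported/Eigen/CXX11/Tensor>
    using namespace Eigen;

    struct IotaGenerator {
      float operator()(const array<DenseIndex, 1>& coords) const {
        return static_cast<float>(coords[0]);   // value = coordinate
      }
    };

    int main()
    {
      Tensor<float, 1> t(8);
      t.setConstant(4.0f);

      Tensor<float, 1> r(8);
      r = t.rsqrt();                    // elementwise 1/sqrt(x) -> 0.5

      Tensor<int, 1> c(8);
      c = t.cast<int>();                // routed through TensorConversionOp

      Tensor<float, 1> g(8);
      g = t.generate(IotaGenerator());  // coordinate-driven fill
      return 0;
    }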
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index f7254a24d..270383020 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -32,7 +32,7 @@ enum {
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
- size_t packet_size, bool inner_dim_contiguous>
+ int packet_size, bool inner_dim_contiguous>
class BaseTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
@@ -162,14 +162,14 @@ class BaseTensorContractionMapper {
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
- size_t packet_size,
+ int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper;
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
- size_t packet_size,
+ int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper {
public:
@@ -231,7 +231,7 @@ class TensorContractionSubMapper {
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
- size_t packet_size = (Tensor::PacketAccess ? packet_traits<Scalar>::size : 1),
+ int packet_size = (Tensor::PacketAccess ? packet_traits<Scalar>::size : 1),
bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned>
class TensorContractionInputMapper
: public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> {
@@ -507,7 +507,7 @@ struct TensorContractionEvaluatorBase
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
static const int RDims =
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
+ static const unsigned int ContractDims = internal::array_size<Indices>::value;
static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
typedef array<Index, LDims> left_dim_mapper_t;
@@ -545,7 +545,7 @@ struct TensorContractionEvaluatorBase
eval_right_dims[i] = m_rightImpl.dimensions()[i];
}
// We keep the pairs of contracting indices.
- for (int i = 0; i < ContractDims; i++) {
+ for (unsigned int i = 0; i < ContractDims; i++) {
eval_op_indices[i].first = op.indices()[i].first;
eval_op_indices[i].second = op.indices()[i].second;
}
@@ -559,7 +559,7 @@ struct TensorContractionEvaluatorBase
}
// We need to flip all the pairs of contracting indices as well as
// reversing the dimensions.
- for (int i = 0; i < ContractDims; i++) {
+ for (unsigned int i = 0; i < ContractDims; i++) {
eval_op_indices[i].first = LDims - 1 - op.indices()[i].second;
eval_op_indices[i].second = RDims - 1 - op.indices()[i].first;
}
@@ -591,12 +591,12 @@ struct TensorContractionEvaluatorBase
// dimensions and right non-contracting dimensions.
m_lhs_inner_dim_contiguous = true;
int dim_idx = 0;
- int nocontract_idx = 0;
+ unsigned int nocontract_idx = 0;
for (int i = 0; i < LDims; i++) {
// find if we are contracting on index i of left tensor
bool contracting = false;
- for (int j = 0; j < ContractDims; j++) {
+ for (unsigned int j = 0; j < ContractDims; j++) {
if (eval_op_indices[j].first == i) {
contracting = true;
break;
@@ -624,7 +624,7 @@ struct TensorContractionEvaluatorBase
for (int i = 0; i < RDims; i++) {
bool contracting = false;
// find if we are contracting on index i of right tensor
- for (int j = 0; j < ContractDims; j++) {
+ for (unsigned int j = 0; j < ContractDims; j++) {
if (eval_op_indices[j].second == i) {
contracting = true;
break;
@@ -651,7 +651,7 @@ struct TensorContractionEvaluatorBase
// each tensor, we'll only look at the first tensor here.
m_rhs_inner_dim_contiguous = true;
m_rhs_inner_dim_reordered = false;
- for (int i = 0; i < ContractDims; i++) {
+ for (unsigned int i = 0; i < ContractDims; i++) {
Index left = eval_op_indices[i].first;
Index right = eval_op_indices[i].second;
@@ -751,8 +751,8 @@ struct TensorContractionEvaluatorBase
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
- const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
- const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+ const Index lhs_packet_size = internal::packet_traits<LhsScalar>::size;
+ const Index rhs_packet_size = internal::packet_traits<RhsScalar>::size;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
LeftEvaluator, left_nocontract_t,
contract_t, lhs_packet_size,
@@ -916,8 +916,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
- const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
- const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+ const Index lhs_packet_size = internal::packet_traits<LhsScalar>::size;
+ const Index rhs_packet_size = internal::packet_traits<RhsScalar>::size;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
LeftEvaluator, left_nocontract_t,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 9259c864e..ed87d3100 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -46,8 +46,8 @@ struct packRhsAndKernelArg {
const Index n_block_idx;
const Index m_blocks;
const Index n_blocks;
- std::vector<Promise>* kernel_promises;
- const std::vector<Future>* lhs_futures;
+ std::vector<Notification*>* kernel_notifications;
+ const std::vector<Notification*>* lhs_notifications;
const bool need_to_pack;
};
@@ -219,17 +219,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
}
- // lhs_futures starts with all null futures
- std::vector<Future> lhs_futures(num_threads);
+ // lhs_notifications starts with all null Notifications
+ std::vector<Notification*> lhs_notifications(num_threads, nullptr);
// this should really be numBlockAs * n_blocks;
- const Index num_kernel_promises = num_threads * n_blocks;
- std::vector<Promise> kernel_promises(num_kernel_promises);
- std::vector<Future> kernel_futures(num_kernel_promises);
- for (std::size_t i = 0; i < kernel_promises.size(); ++i) {
- kernel_promises[i].set_value();
- kernel_futures[i] = kernel_promises[i].get_future();
- }
+ const Index num_kernel_notifications = num_threads * n_blocks;
+ std::vector<Notification*> kernel_notifications(num_kernel_notifications,
+ nullptr);
for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
const Index k_start = k_block_idx * kc;
@@ -245,11 +241,16 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
eigen_assert(actual_mc > 0);
Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
+
for (int i = 0; i < n_blocks; ++i) {
- Index future_id = (blockAId * n_blocks + i);
- wait_until_ready(&kernel_futures[future_id]);
- kernel_promises[future_id] = Promise();
- kernel_futures[future_id] = kernel_promises[future_id].get_future();
+ Index notification_id = (blockAId * n_blocks + i);
+ // Wait for any current kernels using this slot to complete
+ // before using it.
+ if (kernel_notifications[notification_id]) {
+ wait_until_ready(kernel_notifications[notification_id]);
+ delete kernel_notifications[notification_id];
+ }
+ kernel_notifications[notification_id] = new Notification();
}
const packLArg arg = {
blockAs[blockAId], // blockA
@@ -260,8 +261,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
actual_kc, // kc
};
- lhs_futures[blockAId] =
- this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
+ // Delete any existing notification since we may be
+ // replacing it. The algorithm should ensure that there are
+ // no existing waiters on this notification.
+ delete lhs_notifications[blockAId];
+ lhs_notifications[blockAId] =
+ this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
}
// now start kernels.
@@ -278,7 +283,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index i = num_blocks; i < num_threads; ++i) {
Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
Index future_id = (blockAId * n_blocks + n_block_idx);
- wait_until_ready(&kernel_futures[future_id]);
+ wait_until_ready(kernel_notifications[future_id]);
}
}
@@ -301,26 +306,36 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
n_block_idx, // n_block_idx
m_blocks, // m_blocks
n_blocks, // n_blocks
- &kernel_promises, // kernel_promises
- &lhs_futures, // lhs_futures
+ &kernel_notifications, // kernel notifications
+ &lhs_notifications, // lhs notifications
need_to_pack, // need_to_pack
};
- this->m_device.enqueueNoFuture(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
+ // We asynchronously kick off this function, which ends up
+ // notifying the appropriate kernel_notifications objects,
+ // which this thread waits on before exiting.
+ this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
}
}
}
// Make sure all the kernels are done.
- for (int i = 0; i < kernel_futures.size(); ++i) {
- wait_until_ready(&kernel_futures[i]);
+ for (size_t i = 0; i < kernel_notifications.size(); ++i) {
+ wait_until_ready(kernel_notifications[i]);
+ delete kernel_notifications[i];
+ }
+
+ // No need to wait for lhs notifications since they should have
+ // already been waited on. Just clean them up.
+ for (size_t i = 0; i < lhs_notifications.size(); ++i) {
+ delete lhs_notifications[i];
}
// deallocate all of the memory for both A and B's
- for (int i = 0; i < blockAs.size(); i++) {
+ for (size_t i = 0; i < blockAs.size(); i++) {
this->m_device.deallocate(blockAs[i]);
}
- for (int i = 0; i < blockBs.size(); i++) {
+ for (size_t i = 0; i < blockBs.size(); i++) {
this->m_device.deallocate(blockBs[i]);
}
@@ -360,15 +375,15 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index m_base_start = arg.m + arg.mc*mt_block_idx;
if (m_base_start < arg.max_m) {
Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
-
- wait_until_ready(&(*arg.lhs_futures)[blockAId]);
+ wait_until_ready((*arg.lhs_notifications)[blockAId]);
const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start;
gebp(arg.output.getSubMapper(m_base_start, arg.n),
(*arg.blockAs)[blockAId], arg.blockB,
actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
+ // Notify that the kernel is done.
const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
- (*arg.kernel_promises)[set_idx].set_value();
+ (*arg.kernel_notifications)[set_idx]->Notify();
}
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
new file mode 100644
index 000000000..fb1f1f6ea
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -0,0 +1,206 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
+
+namespace Eigen {
+
+/** \class TensorConversionOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor conversion class. This class makes it possible to vectorize
+ * type casting operations when the number of scalars per packet differs
+ * between the source and the destination types
+ */
+namespace internal {
+template<typename TargetType, typename XprType>
+struct traits<TensorConversionOp<TargetType, XprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef TargetType Scalar;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename traits<XprType>::StorageKind StorageKind;
+ typedef typename traits<XprType>::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = traits<XprType>::NumDimensions;
+ static const int Layout = traits<XprType>::Layout;
+ enum { Flags = 0 };
+};
+
+template<typename TargetType, typename XprType>
+struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense>
+{
+ typedef const TensorConversionOp<TargetType, XprType>& type;
+};
+
+template<typename TargetType, typename XprType>
+struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type>
+{
+ typedef TensorConversionOp<TargetType, XprType> type;
+};
+
+} // end namespace internal
+
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct PacketConverter {
+ PacketConverter(const TensorEvaluator& impl)
+ : m_impl(impl) {}
+
+ template<int LoadMode, typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+ return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index));
+ }
+
+ private:
+ const TensorEvaluator& m_impl;
+};
+
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
+ PacketConverter(const TensorEvaluator& impl)
+ : m_impl(impl) {}
+
+ template<int LoadMode, typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+ const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+ SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+ SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
+ TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2);
+ return result;
+ }
+
+ private:
+ const TensorEvaluator& m_impl;
+};
+
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
+ PacketConverter(const TensorEvaluator& impl)
+ : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
+
+ template<int LoadMode, typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+ const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+ // Only call m_impl.packet() when we have direct access to the underlying data. This
+ // ensures that we don't compute the subexpression twice. We may however load some
+ // coefficients twice, but in practice this doesn't negatively impact performance.
+ if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) {
+ // Force unaligned memory loads since we can't ensure alignment anymore
+ return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
+ } else {
+ const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+ EIGEN_ALIGN_DEFAULT typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+ for (int i = 0; i < TgtPacketSize; ++i) {
+ values[i] = m_impl.coeff(index+i);
+ }
+ TgtPacket rslt = internal::pload<TgtPacket>(values);
+ return rslt;
+ }
+ }
+
+ private:
+ const TensorEvaluator& m_impl;
+ const typename TensorEvaluator::Index m_maxIndex;
+};
+
+template<typename TargetType, typename XprType>
+class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename internal::traits<TensorConversionOp>::Scalar Scalar;
+ typedef typename internal::traits<TensorConversionOp>::Packet Packet;
+ typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorConversionOp>::Index Index;
+ typedef typename internal::nested<TensorConversionOp>::type Nested;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr)
+ : m_xpr(xpr) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+};
+
+
+
+
+// Eval as rvalue
+template<typename TargetType, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
+{
+ typedef TensorConversionOp<TargetType, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef TargetType Scalar;
+ typedef TargetType CoeffReturnType;
+ typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
+ typedef typename internal::traits<XprType>::Packet PacketReturnType;
+ typedef typename internal::packet_traits<SrcType>::type PacketSourceType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+ {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+ {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ internal::scalar_cast_op<SrcType, TargetType> converter;
+ return converter(m_impl.coeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+ const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+ PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
+ SrcCoeffRatio, TgtCoeffRatio> converter(m_impl);
+ return converter.template packet<LoadMode>(index);
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
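Editor's note: the SrcCoeffRatio/TgtCoeffRatio machinery exists because packet widths differ across types: with SSE, a float packet holds four scalars and a double packet two, so a float-to-double cast matches the <..., 1, 2> specialization above while an int-to-float cast is the 1:1 case. From the user side the whole mechanism is reached through cast(); a sketch:

    #include <unsupported/Eigen/CXX11/Tensor>
    using namespace Eigen;

    int main()
    {
      Tensor<float, 2> f(64, 64);
      f.setRandom();

      // Evaluated through TensorConversionOp; vectorized when
      // type_casting_traits<float, double>::VectorizedCast holds.
      Tensor<double, 2> d(64, 64);
      d = f.cast<double>();
      return 0;
    }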
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 1db5f1232..6b8f71b96 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -877,7 +877,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
- dim3 num_blocks(num_x_blocks, min<int>(num_y_blocks, ceil(numP, block_size.y)));
+ dim3 num_blocks(num_x_blocks, std::min<int>(num_y_blocks, ceil(numP, block_size.y)));
//cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
@@ -935,7 +935,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
- dim3 num_blocks(num_x_blocks, num_y_blocks, min<int>(num_z_blocks, ceil(numP, block_size.z)));
+ dim3 num_blocks(num_x_blocks, num_y_blocks, std::min<int>(num_z_blocks, ceil(numP, block_size.z)));
//cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index efd207507..1018395a1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -38,19 +38,151 @@ struct DefaultDevice {
// We should really use a thread pool here but first we need to find a portable thread pool library.
#ifdef EIGEN_USE_THREADS
-typedef std::future<void> Future;
-typedef std::promise<void> Promise;
+// The implementation of the ThreadPool type ensures that the Schedule method
+// runs the functions it is provided in FIFO order when the scheduling is done
+// by a single thread.
+class ThreadPool {
+ public:
+ // Construct a pool that contains "num_threads" threads.
+ explicit ThreadPool(int num_threads) {
+ for (int i = 0; i < num_threads; i++) {
+ threads_.push_back(new std::thread([this]() { WorkerLoop(); }));
+ }
+ }
-static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) {
- f->wait();
-}
-static EIGEN_STRONG_INLINE void get_when_ready(Future* f) {
- f->get();
+ // Wait until all scheduled work has finished and then destroy the
+ // set of threads.
+ ~ThreadPool()
+ {
+ {
+ // Wait for all work to get done.
+ std::unique_lock<std::mutex> l(mu_);
+ empty_.wait(l, [this]() { return pending_.empty(); });
+ exiting_ = true;
+
+ // Wakeup all waiters.
+ for (auto w : waiters_) {
+ w->ready = true;
+ w->work = nullptr;
+ w->cv.notify_one();
+ }
+ }
+
+ // Wait for threads to finish.
+ for (auto t : threads_) {
+ t->join();
+ delete t;
+ }
+ }
+
+ // Schedule fn() for execution in the pool of threads. The functions are
+ // executed in the order in which they are scheduled.
+ void Schedule(std::function<void()> fn) {
+ std::unique_lock<std::mutex> l(mu_);
+ if (waiters_.empty()) {
+ pending_.push_back(fn);
+ } else {
+ Waiter* w = waiters_.back();
+ waiters_.pop_back();
+ w->ready = true;
+ w->work = fn;
+ w->cv.notify_one();
+ }
+ }
+
+ protected:
+ void WorkerLoop() {
+ std::unique_lock<std::mutex> l(mu_);
+ Waiter w;
+ while (!exiting_) {
+ std::function<void()> fn;
+ if (pending_.empty()) {
+ // Wait for work to be assigned to me
+ w.ready = false;
+ waiters_.push_back(&w);
+ w.cv.wait(l, [&w]() { return w.ready; });
+ fn = w.work;
+ w.work = nullptr;
+ } else {
+ // Pick up pending work
+ fn = pending_.front();
+ pending_.pop_front();
+ if (pending_.empty()) {
+ empty_.notify_all();
+ }
+ }
+ if (fn) {
+ mu_.unlock();
+ fn();
+ mu_.lock();
+ }
+ }
+ }
+
+ private:
+ struct Waiter {
+ std::condition_variable cv;
+ std::function<void()> work;
+ bool ready;
+ };
+
+ std::mutex mu_;
+ std::vector<std::thread*> threads_; // All threads
+ std::vector<Waiter*> waiters_; // Stack of waiting threads.
+ std::deque<std::function<void()>> pending_; // Queue of pending work
+ std::condition_variable empty_; // Signaled on pending_.empty()
+ bool exiting_ = false;
+};
+
+
+// Notification is an object that allows a user to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+class Notification {
+ public:
+ Notification() : notified_(false) {}
+ ~Notification() {}
+
+ void Notify() {
+ std::unique_lock<std::mutex> l(mu_);
+ eigen_assert(!notified_);
+ notified_ = true;
+ cv_.notify_all();
+ }
+
+ void WaitForNotification() {
+ std::unique_lock<std::mutex> l(mu_);
+ cv_.wait(l, [this]() { return notified_; } );
+ }
+
+ private:
+ std::mutex mu_;
+ std::condition_variable cv_;
+ bool notified_;
+};
+
+// Runs an arbitrary function and then calls Notify() on the passed-in
+// Notification.
+template <typename Function, typename... Args> struct FunctionWrapper
+{
+ static void run(Notification* n, Function f, Args... args) {
+ f(args...);
+ n->Notify();
+ }
+};
+
+static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) {
+ if (n) {
+ n->WaitForNotification();
+ }
}
+// Build a thread pool device on top of an existing pool of threads.
struct ThreadPoolDevice {
- ThreadPoolDevice(size_t num_cores) : num_threads_(num_cores) { }
+ ThreadPoolDevice(ThreadPool* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
@@ -73,15 +205,21 @@ struct ThreadPoolDevice {
}
template <class Function, class... Args>
- EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const {
- return std::async(std::launch::async, f, args...);
+ EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
+ Notification* n = new Notification();
+ std::function<void()> func =
+ std::bind(&FunctionWrapper<Function, Args...>::run, n, f, args...);
+ pool_->Schedule(func);
+ return n;
}
template <class Function, class... Args>
- EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const {
- std::async(std::launch::async, f, args...);
+ EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+ std::function<void()> func = std::bind(f, args...);
+ pool_->Schedule(func);
}
private:
+ ThreadPool* pool_;
size_t num_threads_;
};
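Editor's note: putting the new pieces together, a sketch of the intended usage under EIGEN_USE_THREADS; ownership of the returned Notification lies with the caller, matching the deletes in the contraction code above:

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    using namespace Eigen;

    static void work(int /*arg*/) { /* ... */ }

    int main()
    {
      ThreadPool pool(4);                  // four worker threads
      ThreadPoolDevice device(&pool, 4);   // device view over the shared pool

      Notification* done = device.enqueue(&work, 42);
      wait_until_ready(done);              // blocks until work(42) has run
      delete done;                         // caller owns the Notification
      return 0;
    }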
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 43917cbc3..836daea65 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -74,16 +74,16 @@ struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMaj
// Fixed size
#ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::size_t... Indices>
-struct Sizes : internal::numeric_list<std::size_t, Indices...> {
- typedef internal::numeric_list<std::size_t, Indices...> Base;
- static const std::size_t total_size = internal::arg_prod(Indices...);
+template <typename std::ptrdiff_t... Indices>
+struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
+ typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+ static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
return Base::count;
}
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() {
return internal::arg_prod(Indices...);
}
@@ -94,7 +94,7 @@ struct Sizes : internal::numeric_list<std::size_t, Indices...> {
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex...) { }
- explicit Sizes(std::initializer_list<std::size_t> /*l*/) {
+ explicit Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
// todo: add assertion
}
#endif
@@ -114,8 +114,8 @@ struct Sizes : internal::numeric_list<std::size_t, Indices...> {
}
};
-template <typename std::size_t... Indices>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<Indices...>&) {
+template <typename std::ptrdiff_t... Indices>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
return Sizes<Indices...>::total_size;
}
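// Illustrative note (not part of the patch): behavior is unchanged, only the
// index type of the fixed dimension list is now signed, e.g.
//   Sizes<3, 5, 7> s;  // rank() == 3, TotalSize() == 105, as std::ptrdiff_t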
@@ -173,18 +173,18 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this));
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *reinterpret_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this));
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *reinterpret_cast<const Base*>(this));
}
};
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
return Sizes<V1, V2, V3, V4, V5>::total_size;
-};
+}
#endif
@@ -225,7 +225,7 @@ struct DSizes : array<DenseIndex, NumDims> {
return NumDims;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
return internal::array_prod(*static_cast<const Base*>(this));
}
@@ -281,10 +281,10 @@ struct DSizes : array<DenseIndex, NumDims> {
}
// A constexpr would be so much better here
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
}
};
@@ -327,13 +327,13 @@ template <typename DenseIndex, std::size_t NumDims> struct array_size<DSizes<Den
static const size_t value = NumDims;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::size_t... Indices> struct array_size<const Sizes<Indices...> > {
-static const size_t value = Sizes<Indices...>::count;
+template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
+static const std::ptrdiff_t value = Sizes<Indices...>::count;
};
-template <typename std::size_t... Indices> struct array_size<Sizes<Indices...> > {
-static const size_t value = Sizes<Indices...>::count;
+template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...> > {
+static const std::ptrdiff_t value = Sizes<Indices...>::count;
};
-template <std::size_t n, typename std::size_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<Indices...>&) {
+template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
- return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
+ return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
}
#else
@@ -345,7 +345,7 @@ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::s
};
template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
-};
+}
#endif
@@ -369,7 +369,7 @@ struct sizes_match_up_to_dim<Dims1, Dims2, 0> {
template <typename Dims1, typename Dims2>
bool dimensions_match(Dims1& dims1, Dims2& dims2) {
- if (internal::array_size<Dims1>::value != internal::array_size<Dims2>::value) {
+ if (static_cast<size_t>(internal::array_size<Dims1>::value) != static_cast<size_t>(internal::array_size<Dims2>::value)) {
return false;
}
return internal::sizes_match_up_to_dim<Dims1, Dims2, internal::array_size<Dims1>::value-1>::run(dims1, dims2);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 9198c17ef..a38af84d5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -132,13 +132,20 @@ struct TensorEvaluator<const Derived, Device>
CoordAccess = NumCoords > 0,
};
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&)
- : m_data(m.data()), m_dims(m.dimensions())
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+ : m_data(m.data()), m_dims(m.dimensions()), m_device(device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+ if (internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value && data) {
+ m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
+ return false;
+ }
+ return true;
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
@@ -172,6 +179,7 @@ struct TensorEvaluator<const Derived, Device>
protected:
const Scalar* m_data;
Dimensions m_dims;
+ const Device& m_device;
};
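// Illustrative note (not part of the patch): storing the device lets
// evalSubExprsIfNeeded bulk-copy arithmetic data into a caller-provided
// buffer,
//   m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
// and return false, which signals that no per-coefficient evaluation is
// needed.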
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index bb2f8b977..6ea588e4b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -131,7 +131,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
- std::vector<Future> results;
+ std::vector<Notification*> results;
results.reserve(numblocks);
for (int i = 0; i < numblocks; ++i) {
results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize));
@@ -142,7 +142,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
}
for (int i = 0; i < numblocks; ++i) {
- get_when_ready(&results[i]);
+ wait_until_ready(results[i]);
+ delete results[i];
}
}
@@ -157,7 +158,11 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) {
+EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) {
+ // CUDA passes kernel arguments by bitwise copy. That's fine for PODs, but
+ // for more complex types such as evaluators we should conform to the C++
+ // standard and invoke a proper copy constructor.
+ Evaluator eval(memcopied_eval);
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
@@ -171,7 +176,11 @@ EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) {
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel_Vectorizable(Evaluator eval, Index size) {
+EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
+ // CUDA passes kernel arguments by bitwise copy. That's fine for PODs, but
+ // for more complex types such as evaluators we should conform to the C++
+ // standard and invoke a proper copy constructor.
+ Evaluator eval(memcopied_eval);
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index b66b3ec2c..194c68929 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -276,16 +276,20 @@ class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType,
typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
+ EIGEN_DEVICE_FUNC
TensorSelectOp(const IfXprType& a_condition,
const ThenXprType& a_then,
const ElseXprType& a_else)
: m_condition(a_condition), m_then(a_then), m_else(a_else)
{ }
+ EIGEN_DEVICE_FUNC
const IfXprType& ifExpression() const { return m_condition; }
+ EIGEN_DEVICE_FUNC
const ThenXprType& thenExpression() const { return m_then; }
+ EIGEN_DEVICE_FUNC
const ElseXprType& elseExpression() const { return m_else; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 94b3f957b..76998b690 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -23,12 +23,12 @@ namespace Eigen {
* Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
*/
-template<typename Scalar_, typename Dimensions_, int Options_>
-class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> >
+template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
+class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> >
{
public:
- typedef TensorFixedSize<Scalar_, Dimensions_, Options_> Self;
- typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> > Base;
+ typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self;
+ typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<Self>::StorageKind StorageKind;
typedef typename internal::traits<Self>::Index Index;
@@ -50,7 +50,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
static const std::size_t NumIndices = Dimensions::count;
protected:
- TensorStorage<Scalar, NumIndices, Dimensions::total_size, Options, Dimensions> m_storage;
+ TensorStorage<Scalar, Dimensions, Options> m_storage;
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 7bec2b10a..a4224c372 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -12,8 +12,8 @@
namespace Eigen {
-template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0> class Tensor;
-template<typename Scalar_, typename Dimensions, int Options_ = 0> class TensorFixedSize;
+template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
+template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
template<typename PlainObjectType> class TensorRef;
template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
@@ -25,6 +25,7 @@ template<typename IfXprType, typename ThenXprType, typename ElseXprType> class T
template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
+template<typename TargetType, typename XprType> class TensorConversionOp;
template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
template<typename PatchDim, typename XprType> class TensorPatchOp;
template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
@@ -37,6 +38,7 @@ template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
template<typename Shuffle, typename XprType> class TensorShufflingOp;
template<typename Strides, typename XprType> class TensorStridingOp;
+template<typename Generator, typename XprType> class TensorGeneratorOp;
template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
template<typename XprType> class TensorEvalToOp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 25f085a59..1b031b7a1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -182,12 +182,46 @@ template <typename T> struct ProdReducer
}
};
+
+// Random number generation
+namespace {
+#ifdef __CUDA_ARCH__
+__device__ int get_random_seed() {
+ return clock();
+}
+#else
+int get_random_seed() {
+#ifdef _WIN32
+ SYSTEMTIME st;
+ GetSystemTime(&st);
+ return st.wSecond + 1000 * st.wMilliseconds;
+#elif defined __APPLE__
+ return static_cast<int>(mach_absolute_time());
+#else
+ timespec ts;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ return static_cast<int>(ts.tv_nsec);
+#endif
+}
+#endif
+}
+
#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
// We're not compiling a cuda kernel
-template <typename T> struct UniformRandomGenerator {
+template <typename T> class UniformRandomGenerator {
+ public:
static const bool PacketAccess = true;
+ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
+ if (!deterministic) {
+ srand(get_random_seed());
+ }
+ }
+ UniformRandomGenerator(const UniformRandomGenerator& other) {
+ m_deterministic = other.m_deterministic;
+ }
+
template<typename Index>
T operator()(Index, Index = 0) const {
return random<T>();
@@ -201,53 +235,149 @@ template <typename T> struct UniformRandomGenerator {
}
return internal::pload<typename internal::packet_traits<T>::type>(values);
}
+
+ private:
+ bool m_deterministic;
+};
+
+#if __cplusplus > 199711
+template <> class UniformRandomGenerator<float> {
+ public:
+ static const bool PacketAccess = true;
+
+ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
+ if (!deterministic) {
+ m_generator.seed(get_random_seed());
+ }
+ }
+ UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
+ m_generator.seed(other(0, 0) * UINT_MAX);
+ m_deterministic = other.m_deterministic;
+ }
+
+ template<typename Index>
+ float operator()(Index, Index = 0) const {
+ return m_distribution(m_generator);
+ }
+ template<typename Index>
+ typename internal::packet_traits<float>::type packetOp(Index i, Index j = 0) const {
+ const int packetSize = internal::packet_traits<float>::size;
+ EIGEN_ALIGN_DEFAULT float values[packetSize];
+ for (int k = 0; k < packetSize; ++k) {
+ values[k] = this->operator()(i, j);
+ }
+ return internal::pload<typename internal::packet_traits<float>::type>(values);
+ }
+
+ private:
+ UniformRandomGenerator& operator = (const UniformRandomGenerator&);
+ // Make sure m_deterministic comes first to match the layout of the cpu
+ // version of the code.
+ bool m_deterministic;
+ mutable std::mt19937 m_generator;
+ mutable std::uniform_real_distribution<float> m_distribution;
+};
+
+template <> class UniformRandomGenerator<double> {
+ public:
+ static const bool PacketAccess = true;
+
+ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
+ if (!deterministic) {
+ m_generator.seed(get_random_seed());
+ }
+ }
+ UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
+ m_generator.seed(other(0, 0) * UINT_MAX);
+ m_deterministic = other.m_deterministic;
+ }
+
+ template<typename Index>
+ double operator()(Index, Index = 0) const {
+ return m_distribution(m_generator);
+ }
+ template<typename Index>
+ typename internal::packet_traits<double>::type packetOp(Index i, Index j = 0) const {
+ const int packetSize = internal::packet_traits<double>::size;
+ EIGEN_ALIGN_DEFAULT double values[packetSize];
+ for (int k = 0; k < packetSize; ++k) {
+ values[k] = this->operator()(i, j);
+ }
+ return internal::pload<typename internal::packet_traits<double>::type>(values);
+ }
+
+ private:
+ UniformRandomGenerator& operator = (const UniformRandomGenerator&);
+ // Make sure m_deterministic comes first to match the layout of the cpu
+ // version of the code.
+ bool m_deterministic;
+ mutable std::mt19937 m_generator;
+ mutable std::uniform_real_distribution<double> m_distribution;
};
+#endif
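+// Illustrative usage sketch (not part of the patch): the deterministic flag
+// trades run-to-run reproducibility for fresh clock-based seeding.
+//
+//   UniformRandomGenerator<float> reproducible;         // same sequence every run
+//   UniformRandomGenerator<float> clock_seeded(false);  // seeded via get_random_seed()
+//   float sample = clock_seeded(0);                     // one draw from [0, 1)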
#else
// We're compiling a cuda kernel
-template <typename T> struct UniformRandomGenerator;
-
-template <> struct UniformRandomGenerator<float> {
+template <typename T> class UniformRandomGenerator;
+template <> class UniformRandomGenerator<float> {
+ public:
static const bool PacketAccess = true;
- EIGEN_DEVICE_FUNC UniformRandomGenerator() {
+ __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- curand_init(0, tid, 0, &m_state);
+ const int seed = deterministic ? 0 : get_random_seed();
+ curand_init(seed, tid, 0, &m_state);
}
- template<typename Index> EIGEN_DEVICE_FUNC
- float operator()(Index, Index = 0) const {
+ __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
+ m_deterministic = other.m_deterministic;
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ const int seed = m_deterministic ? 0 : get_random_seed();
+ curand_init(seed, tid, 0, &m_state);
+ }
+
+ template<typename Index>
+ __device__ float operator()(Index, Index = 0) const {
return curand_uniform(&m_state);
}
- template<typename Index> EIGEN_DEVICE_FUNC
- float4 packetOp(Index, Index = 0) const {
+ template<typename Index>
+ __device__ float4 packetOp(Index, Index = 0) const {
return curand_uniform4(&m_state);
}
private:
+ bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
-template <> struct UniformRandomGenerator<double> {
-
+template <> class UniformRandomGenerator<double> {
+ public:
static const bool PacketAccess = true;
- EIGEN_DEVICE_FUNC UniformRandomGenerator() {
+ __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- curand_init(0, tid, 0, &m_state);
+ const int seed = deterministic ? 0 : get_random_seed();
+ curand_init(seed, tid, 0, &m_state);
}
- template<typename Index> EIGEN_DEVICE_FUNC
- double operator()(Index, Index = 0) const {
+ __device__ UniformRandomGenerator(const UniformRandomGenerator& other) {
+ m_deterministic = other.m_deterministic;
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ const int seed = m_deterministic ? 0 : get_random_seed();
+ curand_init(seed, tid, 0, &m_state);
+ }
+ template<typename Index>
+ __device__ double operator()(Index, Index = 0) const {
return curand_uniform_double(&m_state);
}
- template<typename Index> EIGEN_DEVICE_FUNC
- double2 packetOp(Index, Index = 0) const {
+ template<typename Index>
+ __device__ double2 packetOp(Index, Index = 0) const {
return curand_uniform2_double(&m_state);
}
private:
+ bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
@@ -256,12 +386,19 @@ template <> struct UniformRandomGenerator<double> {
#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711
// We're not compiling a cuda kernel
-template <typename T> struct NormalRandomGenerator {
-
+template <typename T> class NormalRandomGenerator {
+ public:
static const bool PacketAccess = true;
- NormalRandomGenerator() : m_distribution(0, 1) {}
- NormalRandomGenerator(const NormalRandomGenerator& other) : m_distribution(other.m_distribution) { }
+ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1) {
+ if (!deterministic) {
+ m_generator.seed(get_random_seed());
+ }
+ }
+ NormalRandomGenerator(const NormalRandomGenerator& other)
+ : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution) {
+ m_generator.seed(other(0, 0) * UINT_MAX);
+ }
template<typename Index>
T operator()(Index, Index = 0) const {
@@ -277,61 +414,117 @@ template <typename T> struct NormalRandomGenerator {
return internal::pload<typename internal::packet_traits<T>::type>(values);
}
+ private:
+ bool m_deterministic;
mutable std::normal_distribution<T> m_distribution;
- mutable std::default_random_engine m_generator;
+ mutable std::mt19937 m_generator;
};
#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
// We're compiling a cuda kernel
-template <typename T> struct NormalRandomGenerator;
-
-template <> struct NormalRandomGenerator<float> {
+template <typename T> class NormalRandomGenerator;
+template <> class NormalRandomGenerator<float> {
+ public:
static const bool PacketAccess = true;
- EIGEN_DEVICE_FUNC NormalRandomGenerator() {
+ __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- curand_init(0, tid, 0, &m_state);
+ const int seed = deterministic ? 0 : get_random_seed();
+ curand_init(seed, tid, 0, &m_state);
}
-
- template<typename Index> EIGEN_DEVICE_FUNC
- float operator()(Index, Index = 0) const {
+ __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) {
+ m_deterministic = other.m_deterministic;
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ const int seed = m_deterministic ? 0 : get_random_seed();
+ curand_init(seed, tid, 0, &m_state);
+ }
+ template<typename Index>
+ __device__ float operator()(Index, Index = 0) const {
return curand_normal(&m_state);
}
- template<typename Index> EIGEN_DEVICE_FUNC
- float4 packetOp(Index, Index = 0) const {
+ template<typename Index>
+ __device__ float4 packetOp(Index, Index = 0) const {
return curand_normal4(&m_state);
}
private:
+ bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
-template <> struct NormalRandomGenerator<double> {
-
+template <> class NormalRandomGenerator<double> {
+ public:
static const bool PacketAccess = true;
- EIGEN_DEVICE_FUNC NormalRandomGenerator() {
+ __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
- curand_init(0, tid, 0, &m_state);
+ const int seed = deterministic ? 0 : get_random_seed();
+ curand_init(seed, tid, 0, &m_state);
}
- template<typename Index> EIGEN_DEVICE_FUNC
- double operator()(Index, Index = 0) const {
+ __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) {
+ m_deterministic = other.m_deterministic;
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ const int seed = m_deterministic ? 0 : get_random_seed();
+ curand_init(seed, tid, 0, &m_state);
+ }
+ template<typename Index>
+ __device__ double operator()(Index, Index = 0) const {
return curand_normal_double(&m_state);
}
- template<typename Index> EIGEN_DEVICE_FUNC
- double2 packetOp(Index, Index = 0) const {
+ template<typename Index>
+ __device__ double2 packetOp(Index, Index = 0) const {
return curand_normal2_double(&m_state);
}
private:
+ bool m_deterministic;
mutable curandStatePhilox4_32_10_t m_state;
};
+#else
+
+template <typename T> class NormalRandomGenerator {
+ public:
+ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {}
+
+ private:
+ bool m_deterministic;
+};
+
#endif
+template <typename T, typename Index, size_t NumDims>
+class GaussianGenerator {
+ public:
+ static const bool PacketAccess = false;
+
+ EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means,
+ const array<T, NumDims>& std_devs)
+ : m_means(means)
+ {
+ for (size_t i = 0; i < NumDims; ++i) {
+ m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
+ }
+ }
+
+ T operator()(const array<Index, NumDims>& coordinates) const {
+ T tmp = T(0);
+ for (size_t i = 0; i < NumDims; ++i) {
+ T offset = coordinates[i] - m_means[i];
+ tmp += offset * offset / m_two_sigmas[i];
+ }
+ return std::exp(-tmp);
+ }
+
+ private:
+ array<T, NumDims> m_means;
+ array<T, NumDims> m_two_sigmas;
+};
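+// Illustrative usage sketch (not part of the patch): calling the functor
+// directly; the peak value exp(0) == 1 is reached at the means.
+//
+//   array<float, 2> means = {{2.f, 3.f}};
+//   array<float, 2> std_devs = {{1.f, 1.f}};
+//   GaussianGenerator<float, DenseIndex, 2> gauss(means, std_devs);
+//   array<DenseIndex, 2> coords = {{2, 3}};
+//   float peak = gauss(coords);  // == 1.0f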
+
+
} // end namespace internal
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
new file mode 100644
index 000000000..3a181d6c3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -0,0 +1,181 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
+
+namespace Eigen {
+
+/** \class TensorGenerator
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor generator class.
+ *
+ * Wraps a user-supplied generator functor and calls it with the coordinates
+ * of each requested coefficient, so values are computed on the fly rather
+ * than read from memory.
+ *
+ */
+namespace internal {
+template<typename Generator, typename XprType>
+struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename packet_traits<Scalar>::type Packet;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename Generator, typename XprType>
+struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense>
+{
+ typedef const TensorGeneratorOp<Generator, XprType>& type;
+};
+
+template<typename Generator, typename XprType>
+struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type>
+{
+ typedef TensorGeneratorOp<Generator, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Generator, typename XprType>
+class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar;
+ typedef typename Eigen::internal::traits<TensorGeneratorOp>::Packet Packet;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+ typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator)
+ : m_xpr(expr), m_generator(generator) {}
+
+ EIGEN_DEVICE_FUNC
+ const Generator& generator() const { return m_generator; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Generator m_generator;
+};
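+// Illustrative usage sketch (not part of the patch, and assuming a matching
+// TensorBase::generate() entry point is added alongside this operator):
+//
+//   Eigen::Tensor<float, 2> t(5, 7);
+//   Eigen::Tensor<float, 2> blob = t.generate(gauss);
+//
+// where gauss is any generator functor taking a coordinate array, such as
+// internal::GaussianGenerator from TensorFunctors.h.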
+
+
+// Eval as rvalue
+template<typename Generator, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
+{
+ typedef TensorGeneratorOp<Generator, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ static const int NumDims = internal::array_size<Dimensions>::value;
+ typedef typename XprType::Scalar Scalar;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ BlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_generator(op.generator())
+ {
+ TensorEvaluator<ArgType, Device> impl(op.expression(), device);
+ m_dimensions = impl.dimensions();
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+ }
+ } else {
+ m_strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+ }
+ }
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename XprType::PacketReturnType PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ array<Index, NumDims> coords;
+ extract_coordinates(index, coords);
+ return m_generator(coords);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_strides[i];
+ index -= idx * m_strides[i];
+ coords[i] = idx;
+ }
+ coords[0] = index;
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_strides[i];
+ index -= idx * m_strides[i];
+ coords[i] = idx;
+ }
+ coords[NumDims-1] = index;
+ }
+ }
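+ // Worked example (illustrative): for ColMajor dimensions (3, 5) the strides
+ // computed in the constructor are {1, 3}, so index 7 decomposes into
+ // coords = {1, 2} because 7 == 1 + 2*3.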
+
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_strides;
+ Generator m_generator;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index bdc6ddb87..3b6f2c730 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -30,7 +30,7 @@ std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccesso
typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
typedef typename T::Index Index;
typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
- const Index total_size = tensor.dimensions().TotalSize();
+ const Index total_size = internal::array_prod(tensor.dimensions());
// Print the tensor as a 1d vector or a 2d matrix.
if (internal::array_size<Dimensions>::value == 1) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index eed0a9f05..620b6a8cb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
-#ifdef EIGEN_HAS_CONSTEXPR
+#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
namespace Eigen {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 11c7ce443..108c45a32 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -73,7 +73,7 @@ struct TensorIntDivisor {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
const int N = 32;
eigen_assert(numerator >= 0);
- eigen_assert(numerator <= (1ull<<N) - 1);
+ eigen_assert(static_cast<unsigned long long>(numerator) <= (1ull<<N) - 1);
uint32_t t1 = (multiplier * numerator) >> 32;
uint32_t t = (static_cast<uint32_t>(numerator) - t1) >> shift1;
@@ -87,6 +87,68 @@ struct TensorIntDivisor {
};
+// Optimized version for signed 32 bit integers.
+// Derived from Hacker's Delight.
+template <>
+class TensorIntDivisor<int> {
+ public:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
+ magic = 0;
+ shift = 0;
+ }
+ // Must have 2 <= divider
+ EIGEN_DEVICE_FUNC TensorIntDivisor(int divider) {
+ eigen_assert(divider >= 2);
+ calcMagic(divider);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int n) const {
+#ifdef __CUDA_ARCH__
+ return (__umulhi(magic, n) >> shift);
+#else
+ uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
+ return (static_cast<unsigned int>(v >> 32) >> shift);
+#endif
+ }
+
+private:
+ // Compute the magic numbers. See Hacker's Delight, section 10, for an
+ // in-depth explanation.
+ EIGEN_DEVICE_FUNC void calcMagic(int d) {
+ const unsigned two31 = 0x80000000; // 2**31.
+ unsigned ad = d;
+ unsigned t = two31 + (ad >> 31);
+ unsigned anc = t - 1 - t%ad; // Absolute value of nc.
+ int p = 31; // Init. p.
+ unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|.
+ unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|).
+ unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|.
+ unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|).
+ unsigned delta = 0;
+ do {
+ p = p + 1;
+ q1 = 2*q1; // Update q1 = 2**p/|nc|.
+ r1 = 2*r1; // Update r1 = rem(2**p, |nc|).
+ if (r1 >= anc) { // (Must be an unsigned
+ q1 = q1 + 1; // comparison here).
+ r1 = r1 - anc;}
+ q2 = 2*q2; // Update q2 = 2**p/|d|.
+ r2 = 2*r2; // Update r2 = rem(2**p, |d|).
+ if (r2 >= ad) { // (Must be an unsigned
+ q2 = q2 + 1; // comparison here).
+ r2 = r2 - ad;}
+ delta = ad - r2;
+ } while (q1 < delta || (q1 == delta && r1 == 0));
+
+ magic = (unsigned)(q2 + 1);
+ shift = p - 32;
+ }
+
+ unsigned int magic;
+ int shift;
+};
+
+
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
return divisor.divide(numerator);
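// Illustrative check (not part of the patch): the specialized divisor must
// agree with plain hardware division over the supported range.
//
//   TensorIntDivisor<int> fast_div(7);        // divider must be >= 2
//   eigen_assert(100 / fast_div == 100 / 7);  // both yield 14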
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 01ba0a80f..fa1e6931c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -283,6 +283,26 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
};
+// FIXME: figure out the exact threshold
+namespace {
+template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
+ EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
+ EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; }
+
+ private:
+ Index threshold_;
+};
+
+// Launching the memcpy kernel on a GPU is very expensive: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_GPU
+template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> {
+ EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
+ EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
+};
+#endif
+}
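+// Illustrative sketch (not part of the patch): the trigger replaces the
+// hard-coded "contiguous_values > 2 * numThreads" test below, letting each
+// device pick its own memcpy threshold:
+//
+//   const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
+//   if (trigger(contiguous_values)) { /* take the bulk memcpy path */ }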
+
// Eval as rvalue
template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
@@ -346,7 +366,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
- if (internal::is_arithmetic<Scalar>::value && data && m_impl.data()) {
+ if (internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value && data && m_impl.data()) {
Index contiguous_values = 1;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) {
@@ -364,7 +384,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
}
// Use memcpy if it's going to be faster than using the regular evaluation.
- if (contiguous_values > 2 * m_device.numThreads()) {
+ const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
+ if (trigger(contiguous_values)) {
Scalar* src = m_impl.data();
for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
Index offset = srcCoeff(i);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index de5747905..95116aaee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -53,28 +53,34 @@ struct preserve_inner_most_dims {
static const bool value = false;
};
-#ifdef EIGEN_HAS_CONSTEXPR
+#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
- static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
- index_statically_eq<ReducedDims>()(0, 0) &&
- index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
+ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
+ static const bool tmp2 = index_statically_eq<ReducedDims>()(0, 0);
+ static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
+ static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
- static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
- index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value) &&
- index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
+ static const bool tmp2 = index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value);
+ static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+ static const bool value = tmp1 & tmp2 & tmp3;
+
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
- static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
- index_statically_gt<ReducedDims>()(0, 0);
+ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
+ static const bool tmp2 = index_statically_gt<ReducedDims>()(0, 0);
+ static const bool value = tmp1 & tmp2;
+
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
- static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
- index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
+ static const bool tmp2 = index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+ static const bool value = tmp1 & tmp2;
};
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index acdbc181d..fba7b20a9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -295,7 +295,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
Index index = 0;
if (PlainObjectType::Options & RowMajor) {
index += indices[0];
- for (int i = 1; i < NumIndices; ++i) {
+ for (size_t i = 1; i < NumIndices; ++i) {
index = index * dims[i] + indices[i];
}
} else {
@@ -313,7 +313,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
Index index = 0;
if (PlainObjectType::Options & RowMajor) {
index += indices[0];
- for (int i = 1; i < NumIndices; ++i) {
+ for (size_t i = 1; i < NumIndices; ++i) {
index = index * dims[i] + indices[i];
}
} else {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 1b227e8c2..f567b8c03 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -30,21 +31,22 @@ namespace Eigen {
*
* \sa Tensor
*/
-template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage;
+template<typename T, typename Dimensions, int Options_> class TensorStorage;
// Pure fixed-size storage
-template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename FixedDimensions>
-class TensorStorage
+template<typename T, int Options_, typename FixedDimensions>
+class TensorStorage<T, FixedDimensions, Options_>
{
private:
+ static const std::size_t Size = FixedDimensions::total_size;
+
EIGEN_ALIGN_DEFAULT T m_data[Size];
FixedDimensions m_dimensions;
public:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorStorage() {
- EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
@@ -60,63 +62,46 @@ class TensorStorage
};
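// Illustrative note (not part of the patch): the fixed-size specialization now
// derives its inline array length from the dimension list itself, e.g.
//   TensorStorage<float, Sizes<3, 5, 7>, 0> storage;  // 105 floats in-place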
-
-// pure-dynamic, but without specification of all dimensions explicitly
-template<typename T, DenseIndex NumIndices_, int Options_>
-class TensorStorage<T, NumIndices_, Dynamic, Options_, void>
- : public TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type>
-{
- typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_;
-
- public:
- TensorStorage() { }
- TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>& other) : Base_(other) { }
-
- TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {}
- TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {}
-
- // TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default;
-};
-
// pure dynamic
-template<typename T, DenseIndex NumIndices_, int Options_>
-class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type>
+template<typename T, int Options_, typename IndexType, std::size_t NumIndices_>
+class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
{
- T *m_data;
- DSizes<DenseIndex, NumIndices_> m_dimensions;
-
- typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Self_;
public:
- TensorStorage() : m_data(0), m_dimensions() {}
- TensorStorage(internal::constructor_without_unaligned_array_assert)
- : m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {}
- TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions)
+ typedef IndexType Index;
+ typedef DSizes<IndexType, NumIndices_> Dimensions;
+ typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self;
+
+ EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {}
+ EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert)
+ : m_data(0), m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {}
+ EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions)
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
{ EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
- TensorStorage(const Self_& other)
+
+ EIGEN_DEVICE_FUNC TensorStorage(const Self& other)
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
, m_dimensions(other.m_dimensions)
{
internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data);
}
- Self_& operator=(const Self_& other)
+ EIGEN_DEVICE_FUNC Self& operator=(const Self& other)
{
if (this != &other) {
- Self_ tmp(other);
+ Self tmp(other);
this->swap(tmp);
}
return *this;
}
- ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
- void swap(Self_& other)
+ EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
+ EIGEN_DEVICE_FUNC void swap(Self& other)
{ std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const {return m_dimensions;}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;}
- EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array<DenseIndex, NumIndices_>& nbDimensions)
+ EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions)
{
- const DenseIndex currentSz = internal::array_prod(m_dimensions);
+ const Index currentSz = internal::array_prod(m_dimensions);
if(size != currentSz)
{
internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
@@ -132,7 +117,11 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
+
+ private:
+ T *m_data;
+ Dimensions m_dimensions;
};
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 66ddfd554..ba09298c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -44,12 +44,12 @@ class compute_tensor_flags
};
-template<typename Scalar_, std::size_t NumIndices_, int Options_>
-struct traits<Tensor<Scalar_, NumIndices_, Options_> >
+template<typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_>
+struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
typedef Scalar_ Scalar;
typedef Dense StorageKind;
- typedef DenseIndex Index;
+ typedef IndexType_ Index;
static const int NumDimensions = NumIndices_;
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
enum {
@@ -59,12 +59,12 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_> >
};
-template<typename Scalar_, typename Dimensions, int Options_>
-struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> >
+template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_>
+struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
{
typedef Scalar_ Scalar;
typedef Dense StorageKind;
- typedef DenseIndex Index;
+ typedef IndexType_ Index;
static const int NumDimensions = array_size<Dimensions>::value;
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
enum {
@@ -107,28 +107,28 @@ struct traits<TensorRef<PlainObjectType> >
};
-template<typename _Scalar, std::size_t NumIndices_, int Options>
-struct eval<Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense>
+template<typename _Scalar, std::size_t NumIndices_, int Options, typename IndexType_>
+struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
{
- typedef const Tensor<_Scalar, NumIndices_, Options>& type;
+ typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
};
-template<typename _Scalar, std::size_t NumIndices_, int Options>
-struct eval<const Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense>
+template<typename _Scalar, std::size_t NumIndices_, int Options, typename IndexType_>
+struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
{
- typedef const Tensor<_Scalar, NumIndices_, Options>& type;
+ typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
};
-template<typename Scalar_, typename Dimensions, int Options>
-struct eval<TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
+template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
};
-template<typename Scalar_, typename Dimensions, int Options>
-struct eval<const TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
+template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
};
template<typename PlainObjectType, int Options>
@@ -156,28 +156,28 @@ struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
};
-template <typename Scalar_, std::size_t NumIndices_, int Options_>
-struct nested<Tensor<Scalar_, NumIndices_, Options_> >
+template <typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_>
+struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
- typedef const Tensor<Scalar_, NumIndices_, Options_>& type;
+ typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
};
-template <typename Scalar_, std::size_t NumIndices_, int Options_>
-struct nested<const Tensor<Scalar_, NumIndices_, Options_> >
+template <typename Scalar_, std::size_t NumIndices_, int Options_, typename IndexType_>
+struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
- typedef const Tensor<Scalar_, NumIndices_, Options_>& type;
+ typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
};
-template <typename Scalar_, typename Dimensions, int Options>
-struct nested<TensorFixedSize<Scalar_, Dimensions, Options> >
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
};
-template <typename Scalar_, typename Dimensions, int Options>
-struct nested<const TensorFixedSize<Scalar_, Dimensions, Options> >
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
};
diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
index 3e733e053..05e5862a5 100644
--- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -16,192 +16,191 @@ namespace Eigen {
namespace internal {
/**
- * Generalized Minimal Residual Algorithm based on the
- * Arnoldi algorithm implemented with Householder reflections.
- *
- * Parameters:
- * \param mat matrix of linear system of equations
- * \param Rhs right hand side vector of linear system of equations
- * \param x on input: initial guess, on output: solution
- * \param precond preconditioner used
- * \param iters on input: maximum number of iterations to perform
- * on output: number of iterations performed
- * \param restart number of iterations for a restart
- * \param tol_error on input: relative residual tolerance
- * on output: residuum achieved
- *
- * \sa IterativeMethods::bicgstab()
- *
- *
- * For references, please see:
- *
- * Saad, Y. and Schultz, M. H.
- * GMRES: A Generalized Minimal Residual Algorithm for Solving Nonsymmetric Linear Systems.
- * SIAM J.Sci.Stat.Comp. 7, 1986, pp. 856 - 869.
- *
- * Saad, Y.
- * Iterative Methods for Sparse Linear Systems.
- * Society for Industrial and Applied Mathematics, Philadelphia, 2003.
- *
- * Walker, H. F.
- * Implementations of the GMRES method.
- * Comput.Phys.Comm. 53, 1989, pp. 311 - 320.
- *
- * Walker, H. F.
- * Implementation of the GMRES Method using Householder Transformations.
- * SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163.
- *
- */
+* Generalized Minimal Residual Algorithm based on the
+* Arnoldi algorithm implemented with Householder reflections.
+*
+* Parameters:
+* \param mat matrix of linear system of equations
+* \param rhs right hand side vector of linear system of equations
+* \param x on input: initial guess, on output: solution
+* \param precond preconditioner used
+* \param iters on input: maximum number of iterations to perform
+* on output: number of iterations performed
+* \param restart number of iterations for a restart
+* \param tol_error on input: relative residual tolerance
+* on output: residuum achieved
+*
+* \sa IterativeMethods::bicgstab()
+*
+*
+* For references, please see:
+*
+* Saad, Y. and Schultz, M. H.
+* GMRES: A Generalized Minimal Residual Algorithm for Solving Nonsymmetric Linear Systems.
+* SIAM J.Sci.Stat.Comp. 7, 1986, pp. 856 - 869.
+*
+* Saad, Y.
+* Iterative Methods for Sparse Linear Systems.
+* Society for Industrial and Applied Mathematics, Philadelphia, 2003.
+*
+* Walker, H. F.
+* Implementations of the GMRES method.
+* Comput.Phys.Comm. 53, 1989, pp. 311 - 320.
+*
+* Walker, H. F.
+* Implementation of the GMRES Method using Householder Transformations.
+* SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163.
+*
+*/
template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Preconditioner & precond,
- Index &iters, const Index &restart, typename Dest::RealScalar & tol_error) {
+ Index &iters, const Index &restart, typename Dest::RealScalar & tol_error) {
- using std::sqrt;
- using std::abs;
+ using std::sqrt;
+ using std::abs;
- typedef typename Dest::RealScalar RealScalar;
- typedef typename Dest::Scalar Scalar;
- typedef Matrix < Scalar, Dynamic, 1 > VectorType;
- typedef Matrix < Scalar, Dynamic, Dynamic > FMatrixType;
+ typedef typename Dest::RealScalar RealScalar;
+ typedef typename Dest::Scalar Scalar;
+ typedef Matrix < Scalar, Dynamic, 1 > VectorType;
+ typedef Matrix < Scalar, Dynamic, Dynamic > FMatrixType;
- RealScalar tol = tol_error;
- const Index maxIters = iters;
- iters = 0;
+ RealScalar tol = tol_error;
+ const Index maxIters = iters;
+ iters = 0;
- const Index m = mat.rows();
+ const Index m = mat.rows();
- // residual and preconditioned residual
- const VectorType p0 = rhs - mat*x;
- VectorType r0 = precond.solve(p0);
+ // residual and preconditioned residual
+ VectorType p0 = rhs - mat*x;
+ VectorType r0 = precond.solve(p0);
- const RealScalar r0Norm = r0.norm();
+ const RealScalar r0Norm = r0.norm();
- // is initial guess already good enough?
- if(r0Norm == 0) {
- tol_error=0;
- return true;
- }
-
- // storage for Hessenberg matrix and Householder data
- FMatrixType H = FMatrixType::Zero(m, restart + 1);
- VectorType w = VectorType::Zero(restart + 1);
- VectorType tau = VectorType::Zero(restart + 1);
-
- // storage for Jacobi rotations
- std::vector < JacobiRotation < Scalar > > G(restart);
-
- // generate first Householder vector
- VectorType e(m-1);
- RealScalar beta;
- r0.makeHouseholder(e, tau.coeffRef(0), beta);
- w(0)=(Scalar) beta;
- H.bottomLeftCorner(m - 1, 1) = e;
-
- for (Index k = 1; k <= restart; ++k) {
-
- ++iters;
-
- VectorType v = VectorType::Unit(m, k - 1), workspace(m);
-
- // apply Householder reflections H_{1} ... H_{k-1} to v
- for (Index i = k - 1; i >= 0; --i) {
- v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
- }
-
- // apply matrix M to v: v = mat * v;
- VectorType t=mat*v;
- v=precond.solve(t);
-
- // apply Householder reflections H_{k-1} ... H_{1} to v
- for (Index i = 0; i < k; ++i) {
- v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
- }
-
- if (v.tail(m - k).norm() != 0.0) {
- if (k <= restart) {
-
- // generate new Householder vector
- VectorType e(m - k - 1);
- RealScalar beta;
- v.tail(m - k).makeHouseholder(e, tau.coeffRef(k), beta);
- H.col(k).tail(m - k - 1) = e;
-
- // apply Householder reflection H_{k} to v
- v.tail(m - k).applyHouseholderOnTheLeft(H.col(k).tail(m - k - 1), tau.coeffRef(k), workspace.data());
-
- }
- }
-
- if (k > 1) {
- for (Index i = 0; i < k - 1; ++i) {
- // apply old Givens rotations to v
- v.applyOnTheLeft(i, i + 1, G[i].adjoint());
- }
- }
-
- if (k<m && v(k) != (Scalar) 0) {
-
- // determine next Givens rotation
- G[k - 1].makeGivens(v(k - 1), v(k));
-
- // apply Givens rotation to v and w
- v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
- w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
- }
-
- // insert coefficients into upper matrix triangle
- H.col(k - 1).head(k) = v.head(k);
-
- bool stop=(k==m || abs(w(k)) < tol * r0Norm || iters == maxIters);
-
- if (stop || k == restart) {
-
- // solve upper triangular system
- VectorType y = w.head(k);
- H.topLeftCorner(k, k).template triangularView < Eigen::Upper > ().solveInPlace(y);
+ // is initial guess already good enough?
+ if(r0Norm == 0)
+ {
+ tol_error = 0;
+ return true;
+ }
- // use Horner-like scheme to calculate solution vector
- VectorType x_new = y(k - 1) * VectorType::Unit(m, k - 1);
+ // storage for Hessenberg matrix and Householder data
+ FMatrixType H = FMatrixType::Zero(m, restart + 1);
+ VectorType w = VectorType::Zero(restart + 1);
+ VectorType tau = VectorType::Zero(restart + 1);
+
+ // storage for Jacobi rotations
+ std::vector < JacobiRotation < Scalar > > G(restart);
+
+ // storage for temporaries
+ VectorType t(m), v(m), workspace(m), x_new(m);
+
+ // generate first Householder vector
+ Ref<VectorType> H0_tail = H.col(0).tail(m - 1);
+ RealScalar beta;
+ r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+ w(0) = Scalar(beta);
+
+ for (Index k = 1; k <= restart; ++k)
+ {
+ ++iters;
- // apply Householder reflection H_{k} to x_new
- x_new.tail(m - k + 1).applyHouseholderOnTheLeft(H.col(k - 1).tail(m - k), tau.coeffRef(k - 1), workspace.data());
+ v = VectorType::Unit(m, k - 1);
- for (Index i = k - 2; i >= 0; --i) {
- x_new += y(i) * VectorType::Unit(m, i);
- // apply Householder reflection H_{i} to x_new
- x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
- }
+ // apply Householder reflections H_{1} ... H_{k-1} to v
+ // TODO: use a HouseholderSequence
+ for (Index i = k - 1; i >= 0; --i) {
+ v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+ }
- x += x_new;
+ // apply preconditioned matrix to v: v = precond.solve(mat * v)
+ t.noalias() = mat * v;
+ v = precond.solve(t);
- if (stop) {
- return true;
- } else {
- k=0;
+ // apply Householder reflections H_{k-1} ... H_{1} to v
+ // TODO: use a HouseholderSequence
+ for (Index i = 0; i < k; ++i) {
+ v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+ }
- // reset data for restart
- const VectorType p0 = rhs - mat*x;
- r0 = precond.solve(p0);
+ if (v.tail(m - k).norm() != 0.0)
+ {
+ if (k <= restart)
+ {
+ // generate new Householder vector
+ Ref<VectorType> Hk_tail = H.col(k).tail(m - k - 1);
+ v.tail(m - k).makeHouseholder(Hk_tail, tau.coeffRef(k), beta);
+
+ // apply Householder reflection H_{k} to v
+ v.tail(m - k).applyHouseholderOnTheLeft(Hk_tail, tau.coeffRef(k), workspace.data());
+ }
+ }
- // clear Hessenberg matrix and Householder data
- H = FMatrixType::Zero(m, restart + 1);
- w = VectorType::Zero(restart + 1);
- tau = VectorType::Zero(restart + 1);
+ if (k > 1)
+ {
+ for (Index i = 0; i < k - 1; ++i)
+ {
+ // apply old Givens rotations to v
+ v.applyOnTheLeft(i, i + 1, G[i].adjoint());
+ }
+ }
- // generate first Householder vector
- RealScalar beta;
- r0.makeHouseholder(e, tau.coeffRef(0), beta);
- w(0)=(Scalar) beta;
- H.bottomLeftCorner(m - 1, 1) = e;
+ if (k<m && v(k) != (Scalar) 0)
+ {
+ // determine next Givens rotation
+ G[k - 1].makeGivens(v(k - 1), v(k));
- }
+ // apply Givens rotation to v and w
+ v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+ w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+ }
- }
+ // insert coefficients into upper matrix triangle
+ H.col(k-1).head(k) = v.head(k);
+ bool stop = (k==m || abs(w(k)) < tol * r0Norm || iters == maxIters);
- }
+ if (stop || k == restart)
+ {
+ // solve upper triangular system
+ Ref<VectorType> y = w.head(k);
+ H.topLeftCorner(k, k).template triangularView <Upper>().solveInPlace(y);
+
+ // use Horner-like scheme to calculate solution vector
+ x_new.setZero();
+ for (Index i = k - 1; i >= 0; --i)
+ {
+ x_new(i) += y(i);
+ // apply Householder reflection H_{i} to x_new
+ x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+ }
+
+ x += x_new;
+
+ if(stop)
+ {
+ return true;
+ }
+ else
+ {
+ k=0;
+
+ // reset data for restart
+ p0.noalias() = rhs - mat*x;
+ r0 = precond.solve(p0);
+
+ // clear Hessenberg matrix and Householder data
+ H.setZero();
+ w.setZero();
+ tau.setZero();
+
+ // generate first Householder vector
+ r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+ w(0) = Scalar(beta);
+ }
+ }
+ }
- return false;
+ return false;
}
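A minimal usage sketch of the solver this kernel backs, assuming the Eigen::GMRES wrapper from unsupported/Eigen/IterativeSolvers (the wrapper forwards the restart length, tolerance and iteration count to the gmres() routine above); a sketch, not part of the patch:

#include <Eigen/SparseCore>
#include <unsupported/Eigen/IterativeSolvers>

// Solve A x = b with restarted GMRES via the high-level wrapper.
Eigen::VectorXd solve_with_gmres(const Eigen::SparseMatrix<double>& A,
                                 const Eigen::VectorXd& b)
{
  Eigen::GMRES<Eigen::SparseMatrix<double> > solver(A);
  solver.set_restart(30);      // the 'restart' parameter documented above
  solver.setTolerance(1e-10);  // 'tol_error' on input
  Eigen::VectorXd x = solver.solve(b);
  // solver.iterations() and solver.error() expose 'iters' and the
  // residual achieved on output.
  return x;
}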
@@ -317,8 +316,8 @@ public:
failed = true;
}
m_info = failed ? NumericalIssue
- : m_error <= Base::m_tolerance ? Success
- : NoConvergence;
+ : m_error <= Base::m_tolerance ? Success
+ : NoConvergence;
m_isInitialized = true;
}
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h b/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
index 731862341..9a4836547 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
@@ -30,7 +30,7 @@ namespace internal {
using std::abs;
typedef typename QRSolver::MatrixType MatrixType;
typedef typename QRSolver::Scalar Scalar;
- typedef typename QRSolver::StorageIndex StorageIndex;
+// typedef typename QRSolver::StorageIndex StorageIndex;
/* Local variables */
Index j;
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 100e617b2..cdc14f86e 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -163,7 +163,7 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
if(M > 0 && N > 0 && NNZ > 0)
{
readsizes = true;
- std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
+ //std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
mat.resize(M,N);
mat.reserve(NNZ);
}
diff --git a/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h b/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
index bf13cf21f..02916ea6f 100644
--- a/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+++ b/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
@@ -41,20 +41,18 @@ enum {
template <typename Scalar>
class MatrixMarketIterator
{
+ typedef typename NumTraits<Scalar>::Real RealScalar;
public:
typedef Matrix<Scalar,Dynamic,1> VectorType;
typedef SparseMatrix<Scalar,ColMajor> MatrixType;
public:
- MatrixMarketIterator(const std::string folder):m_sym(0),m_isvalid(false),m_matIsLoaded(false),m_hasRhs(false),m_hasrefX(false),m_folder(folder)
+ MatrixMarketIterator(const std::string &folder)
+ : m_sym(0), m_isvalid(false), m_matIsLoaded(false), m_hasRhs(false), m_hasrefX(false), m_folder(folder)
{
m_folder_id = opendir(folder.c_str());
- if (!m_folder_id){
- m_isvalid = false;
- std::cerr << "The provided Matrix folder could not be opened \n\n";
- abort();
- }
- Getnextvalidmatrix();
+ if(m_folder_id)
+ Getnextvalidmatrix();
}
~MatrixMarketIterator()
@@ -81,16 +79,30 @@ class MatrixMarketIterator
std::string matrix_file = m_folder + "/" + m_matname + ".mtx";
if ( !loadMarket(m_mat, matrix_file))
{
+ std::cerr << "Warning: loadMarket failed when loading \"" << matrix_file << "\"" << std::endl;
m_matIsLoaded = false;
return m_mat;
}
m_matIsLoaded = true;
-
+
if (m_sym != NonSymmetric)
- { // Store the upper part of the matrix. It is needed by the solvers dealing with nonsymmetric matrices ??
- MatrixType B;
- B = m_mat;
- m_mat = B.template selfadjointView<Lower>();
+ {
+ // Check whether we need to restore a full matrix:
+ RealScalar diag_norm = m_mat.diagonal().norm();
+ RealScalar lower_norm = m_mat.template triangularView<Lower>().norm();
+ RealScalar upper_norm = m_mat.template triangularView<Upper>().norm();
+ if(lower_norm>diag_norm && upper_norm==diag_norm)
+ {
+ // only the lower part is stored
+ MatrixType tmp(m_mat);
+ m_mat = tmp.template selfadjointView<Lower>();
+ }
+ else if(upper_norm>diag_norm && lower_norm==diag_norm)
+ {
+ // only the upper part is stored
+ MatrixType tmp(m_mat);
+ m_mat = tmp.template selfadjointView<Upper>();
+ }
}
return m_mat;
}
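The new loader infers half-stored symmetric matrices by comparing triangular norms: if all off-diagonal mass sits in one triangle, the file stored only that half. The same heuristic as a standalone sketch (restore_full_symmetric is an illustrative name, not part of the patch):

#include <Eigen/SparseCore>

// Rebuild the full matrix when only one triangle was stored on disk.
template <typename Scalar>
void restore_full_symmetric(Eigen::SparseMatrix<Scalar,Eigen::ColMajor>& mat)
{
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  const RealScalar diag  = mat.diagonal().norm();
  const RealScalar lower = mat.template triangularView<Eigen::Lower>().norm();
  const RealScalar upper = mat.template triangularView<Eigen::Upper>().norm();
  if (lower > diag && upper == diag) {
    // only the lower triangle carries off-diagonal entries; mirror it
    Eigen::SparseMatrix<Scalar,Eigen::ColMajor> tmp(mat);
    mat = tmp.template selfadjointView<Eigen::Lower>();
  } else if (upper > diag && lower == diag) {
    // only the upper triangle carries off-diagonal entries; mirror it
    Eigen::SparseMatrix<Scalar,Eigen::ColMajor> tmp(mat);
    mat = tmp.template selfadjointView<Eigen::Upper>();
  }
}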
@@ -143,6 +155,8 @@ class MatrixMarketIterator
m_refX.resize(m_mat.cols());
m_hasrefX = loadMarketVector(m_refX, lhs_file);
}
+ else
+ m_refX.resize(0);
return m_refX;
}
@@ -150,8 +164,9 @@ class MatrixMarketIterator
inline int sym() { return m_sym; }
- inline bool hasRhs() {return m_hasRhs; }
- inline bool hasrefX() {return m_hasrefX; }
+ bool hasRhs() {return m_hasRhs; }
+ bool hasrefX() {return m_hasrefX; }
+ bool isFolderValid() { return bool(m_folder_id); }
protected:
diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h
index c46f728bc..d1636f466 100644
--- a/unsupported/Eigen/src/Splines/Spline.h
+++ b/unsupported/Eigen/src/Splines/Spline.h
@@ -471,7 +471,7 @@ namespace Eigen
r = p;
for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
{
- for (DenseIndex j=p; j>=0; --j) N_(k,j) *= r;
+ for (j=p; j>=0; --j) N_(k,j) *= r;
r *= p-k;
}
}
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 9eabfb620..f438d4107 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -104,6 +104,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_assign "-std=c++0x")
ei_add_test(cxx11_tensor_dimension "-std=c++0x")
ei_add_test(cxx11_tensor_index_list "-std=c++0x")
+ ei_add_test(cxx11_tensor_mixed_indices "-std=c++0x")
ei_add_test(cxx11_tensor_comparisons "-std=c++0x")
ei_add_test(cxx11_tensor_contraction "-std=c++0x")
ei_add_test(cxx11_tensor_convolution "-std=c++0x")
@@ -134,6 +135,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_reverse "-std=c++0x")
ei_add_test(cxx11_tensor_layout_swap "-std=c++0x")
ei_add_test(cxx11_tensor_io "-std=c++0x")
+ ei_add_test(cxx11_tensor_generator "-std=c++0x")
# These tests needs nvcc
# ei_add_test(cxx11_tensor_device "-std=c++0x")
diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
index 4f7ff7067..729e43327 100644
--- a/unsupported/test/cxx11_tensor_casts.cpp
+++ b/unsupported/test/cxx11_tensor_casts.cpp
@@ -17,7 +17,7 @@ using Eigen::array;
static void test_simple_cast()
{
Tensor<float, 2> ftensor(20,30);
- ftensor.setRandom();
+ ftensor = ftensor.random() * 100.f;
Tensor<char, 2> chartensor(20,30);
chartensor.setRandom();
Tensor<std::complex<float>, 2> cplextensor(20,30);
@@ -35,7 +35,81 @@ static void test_simple_cast()
}
+static void test_vectorized_cast()
+{
+ Tensor<int, 2> itensor(20,30);
+ itensor = itensor.random() / 1000;
+ Tensor<float, 2> ftensor(20,30);
+ ftensor.setRandom();
+ Tensor<double, 2> dtensor(20,30);
+ dtensor.setRandom();
+
+ ftensor = itensor.cast<float>();
+ dtensor = itensor.cast<double>();
+
+ for (int i = 0; i < 20; ++i) {
+ for (int j = 0; j < 30; ++j) {
+ VERIFY_IS_EQUAL(itensor(i,j), static_cast<int>(ftensor(i,j)));
+ VERIFY_IS_EQUAL(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+ }
+ }
+}
+
+
+static void test_float_to_int_cast()
+{
+ Tensor<float, 2> ftensor(20,30);
+ ftensor = ftensor.random() * 1000.0f;
+ Tensor<double, 2> dtensor(20,30);
+ dtensor = dtensor.random() * 1000.0;
+
+ Tensor<int, 2> i1tensor = ftensor.cast<int>();
+ Tensor<int, 2> i2tensor = dtensor.cast<int>();
+
+ for (int i = 0; i < 20; ++i) {
+ for (int j = 0; j < 30; ++j) {
+ VERIFY_IS_EQUAL(i1tensor(i,j), static_cast<int>(ftensor(i,j)));
+ VERIFY_IS_EQUAL(i2tensor(i,j), static_cast<int>(dtensor(i,j)));
+ }
+ }
+}
+
+
+static void test_big_to_small_type_cast()
+{
+ Tensor<double, 2> dtensor(20, 30);
+ dtensor.setRandom();
+ Tensor<float, 2> ftensor(20, 30);
+ ftensor = dtensor.cast<float>();
+
+ for (int i = 0; i < 20; ++i) {
+ for (int j = 0; j < 30; ++j) {
+ VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+ }
+ }
+}
+
+
+static void test_small_to_big_type_cast()
+{
+ Tensor<float, 2> ftensor(20, 30);
+ ftensor.setRandom();
+ Tensor<double, 2> dtensor(20, 30);
+ dtensor = ftensor.cast<double>();
+
+ for (int i = 0; i < 20; ++i) {
+ for (int j = 0; j < 30; ++j) {
+ VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+ }
+ }
+}
+
+
void test_cxx11_tensor_casts()
{
CALL_SUBTEST(test_simple_cast());
+ CALL_SUBTEST(test_vectorized_cast());
+ CALL_SUBTEST(test_float_to_int_cast());
+ CALL_SUBTEST(test_big_to_small_type_cast());
+ CALL_SUBTEST(test_small_to_big_type_cast());
}
diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
index bfc2bad18..1832dec8b 100644
--- a/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -345,8 +345,8 @@ static void test_chip_raw_data_col_major()
Tensor<float, 5, ColMajor> tensor(2,3,5,7,11);
tensor.setRandom();
- typedef TensorEvaluator<decltype(tensor.template chip<4>(3)), DefaultDevice> Evaluator4;
- auto chip = Evaluator4(tensor.template chip<4>(3), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<4>(3)), DefaultDevice> Evaluator4;
+ auto chip = Evaluator4(tensor.chip<4>(3), DefaultDevice());
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
@@ -358,20 +358,20 @@ static void test_chip_raw_data_col_major()
}
}
- typedef TensorEvaluator<decltype(tensor.template chip<0>(0)), DefaultDevice> Evaluator0;
- auto chip0 = Evaluator0(tensor.template chip<0>(0), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<0>(0)), DefaultDevice> Evaluator0;
+ auto chip0 = Evaluator0(tensor.chip<0>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0));
- typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1;
- auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<1>(0)), DefaultDevice> Evaluator1;
+ auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
- typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2;
- auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<2>(0)), DefaultDevice> Evaluator2;
+ auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
- typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3;
- auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<3>(0)), DefaultDevice> Evaluator3;
+ auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
}
@@ -380,8 +380,8 @@ static void test_chip_raw_data_row_major()
Tensor<float, 5, RowMajor> tensor(11,7,5,3,2);
tensor.setRandom();
- typedef TensorEvaluator<decltype(tensor.template chip<0>(3)), DefaultDevice> Evaluator0;
- auto chip = Evaluator0(tensor.template chip<0>(3), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<0>(3)), DefaultDevice> Evaluator0;
+ auto chip = Evaluator0(tensor.chip<0>(3), DefaultDevice());
for (int i = 0; i < 7; ++i) {
for (int j = 0; j < 5; ++j) {
for (int k = 0; k < 3; ++k) {
@@ -393,20 +393,20 @@ static void test_chip_raw_data_row_major()
}
}
- typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1;
- auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<1>(0)), DefaultDevice> Evaluator1;
+ auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
- typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2;
- auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<2>(0)), DefaultDevice> Evaluator2;
+ auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
- typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3;
- auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<3>(0)), DefaultDevice> Evaluator3;
+ auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
- typedef TensorEvaluator<decltype(tensor.template chip<4>(0)), DefaultDevice> Evaluator4;
- auto chip4 = Evaluator4(tensor.template chip<4>(0), DefaultDevice());
+ typedef TensorEvaluator<decltype(tensor.chip<4>(0)), DefaultDevice> Evaluator4;
+ auto chip4 = Evaluator4(tensor.chip<4>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0));
}
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index cc9dfb769..03ef12e63 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -47,10 +47,10 @@ static void test_static_dimension_failure()
// This can be worked around in this case.
Tensor<int, 3, DataLayout> concatenation = left
- .reshape(Tensor<int, 3>::Dimensions{{2, 3, 1}})
+ .reshape(Tensor<int, 3>::Dimensions(2, 3, 1))
.concatenate(right, 0);
Tensor<int, 2, DataLayout> alternative = left
- .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{2, 3}}), 0);
+ .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{{2, 3}}}), 0);
}
template<int DataLayout>
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 2bcae90b8..f4acdc504 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -360,7 +360,7 @@ static void test_large_contraction()
t_result = t_left.contract(t_right, dims);
m_result = m_left * m_right;
- for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY(&t_result.data()[i] != &m_result.data()[i]);
VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
}
@@ -388,7 +388,7 @@ static void test_matrix_vector()
t_result = t_left.contract(t_right, dims);
m_result = m_left * m_right;
- for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
}
}
@@ -412,7 +412,7 @@ static void test_tensor_vector()
MapXf m_right(t_right.data(), 1, 7);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose();
- for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
}
}
@@ -443,7 +443,7 @@ static void test_small_blocking_factors()
Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_right(t_right.data(), 93, 140);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left * m_right;
- for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
}
}
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
index 3a12dae62..e3d4675eb 100644
--- a/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -25,7 +25,7 @@ static void test_evals()
Tensor<float, 2, DataLayout> result(2,3);
result.setZero();
- Eigen::array<Tensor<float, 2>::Index, 1> dims3({0});
+ Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}};
typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator;
Evaluator eval(input.convolve(kernel, dims3), DefaultDevice());
@@ -51,7 +51,9 @@ static void test_expr()
kernel.setRandom();
Tensor<float, 2, DataLayout> result(2,2);
- Eigen::array<ptrdiff_t, 2> dims({0, 1});
+ Eigen::array<ptrdiff_t, 2> dims;
+ dims[0] = 0;
+ dims[1] = 1;
result = input.convolve(kernel, dims);
VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
@@ -75,7 +77,8 @@ static void test_modes() {
kernel(1) = 1.0f;
kernel(2) = 0.0f;
- const Eigen::array<ptrdiff_t, 1> dims({0});
+ Eigen::array<ptrdiff_t, 1> dims;
+ dims[0] = 0;
Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;
// Emulate VALID mode (as defined in
@@ -116,9 +119,12 @@ static void test_strides() {
input.setRandom();
kernel.setRandom();
- const Eigen::array<ptrdiff_t, 1> dims({0});
- const Eigen::array<ptrdiff_t, 1> stride_of_3({3});
- const Eigen::array<ptrdiff_t, 1> stride_of_2({2});
+ Eigen::array<ptrdiff_t, 1> dims;
+ dims[0] = 0;
+ Eigen::array<ptrdiff_t, 1> stride_of_3;
+ stride_of_3[0] = 3;
+ Eigen::array<ptrdiff_t, 1> stride_of_2;
+ stride_of_2[0] = 2;
Tensor<float, 1, DataLayout> result;
result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
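The convolution-test changes above all replace brace initialization of Eigen::array with element-wise assignment, which also compiles without C++11 uniform initialization; a sketch of the two equivalent forms:

#include <unsupported/Eigen/CXX11/Tensor>

void array_init_sketch()
{
  // C++11 brace form (the one being removed):
  //   Eigen::array<ptrdiff_t, 1> dims({0});

  // Portable form used throughout the patch:
  Eigen::array<ptrdiff_t, 1> dims;
  dims[0] = 0;
}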
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 0cc4e86f7..247d312ae 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -21,7 +21,7 @@ static void test_dynamic_size()
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
- VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7);
+ VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7);
VERIFY_IS_EQUAL((int)dimensions[0], 2);
VERIFY_IS_EQUAL((int)dimensions[1], 3);
VERIFY_IS_EQUAL((int)dimensions[2], 7);
@@ -34,7 +34,7 @@ static void test_fixed_size()
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
- VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7);
+ VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7);
}
diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp
new file mode 100644
index 000000000..dcb928714
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_generator.cpp
@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+struct Generator1D {
+ Generator1D() { }
+
+ float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+ return coordinates[0];
+ }
+};
+
+template <int DataLayout>
+static void test_1D()
+{
+ Tensor<float, 1> vec(6);
+ Tensor<float, 1> result = vec.generate(Generator1D());
+
+ for (int i = 0; i < 6; ++i) {
+ VERIFY_IS_EQUAL(result(i), i);
+ }
+}
+
+
+struct Generator2D {
+ Generator2D() { }
+
+ float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+ return 3 * coordinates[0] + 11 * coordinates[1];
+ }
+};
+
+template <int DataLayout>
+static void test_2D()
+{
+ Tensor<float, 2> matrix(5, 7);
+ Tensor<float, 2> result = matrix.generate(Generator2D());
+
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+ }
+ }
+}
+
+
+template <int DataLayout>
+static void test_gaussian()
+{
+ int rows = 32;
+ int cols = 48;
+ array<float, 2> means;
+ means[0] = rows / 2.0f;
+ means[1] = cols / 2.0f;
+ array<float, 2> std_devs;
+ std_devs[0] = 3.14f;
+ std_devs[1] = 2.7f;
+ internal::GaussianGenerator<float, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+ Tensor<float, 2> matrix(rows, cols);
+ Tensor<float, 2> result = matrix.generate(gaussian_gen);
+
+ for (int i = 0; i < rows; ++i) {
+ for (int j = 0; j < cols; ++j) {
+ float g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+ float g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+ float gaussian = expf(-g_rows - g_cols);
+ VERIFY_IS_EQUAL(result(i, j), gaussian);
+ }
+ }
+}
+
+
+void test_cxx11_tensor_generator()
+{
+ CALL_SUBTEST(test_1D<ColMajor>());
+ CALL_SUBTEST(test_1D<RowMajor>());
+ CALL_SUBTEST(test_2D<ColMajor>());
+ CALL_SUBTEST(test_2D<RowMajor>());
+ CALL_SUBTEST(test_gaussian<ColMajor>());
+ CALL_SUBTEST(test_gaussian<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
index a510dc695..a50356c74 100644
--- a/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,9 +12,9 @@
#include <Eigen/CXX11/Tensor>
-static void test_signed_32bit()
+void test_signed_32bit()
{
- for (int32_t i = 1; i < 25000; ++i) {
+ for (int32_t i = 2; i < 25000; ++i) {
const Eigen::internal::TensorIntDivisor<int32_t> div(i);
for (int32_t j = 0; j < 25000; ++j) {
@@ -26,7 +26,7 @@ static void test_signed_32bit()
}
-static void test_unsigned_32bit()
+void test_unsigned_32bit()
{
for (uint32_t i = 1; i < 25000; ++i) {
const Eigen::internal::TensorIntDivisor<uint32_t> div(i);
@@ -40,7 +40,7 @@ static void test_unsigned_32bit()
}
-static void test_signed_64bit()
+void test_signed_64bit()
{
for (int64_t i = 2; i < 25000; ++i) {
const Eigen::internal::TensorIntDivisor<int64_t> div(i);
@@ -54,7 +54,7 @@ static void test_signed_64bit()
}
-static void test_unsigned_64bit()
+void test_unsigned_64bit()
{
for (uint64_t i = 2; i < 25000; ++i) {
const Eigen::internal::TensorIntDivisor<uint64_t> div(i);
@@ -70,8 +70,8 @@ static void test_unsigned_64bit()
void test_cxx11_tensor_intdiv()
{
- CALL_SUBTEST(test_signed_32bit());
- CALL_SUBTEST(test_unsigned_32bit());
- CALL_SUBTEST(test_signed_64bit());
- CALL_SUBTEST(test_unsigned_64bit());
+ CALL_SUBTEST_1(test_signed_32bit());
+ CALL_SUBTEST_2(test_unsigned_32bit());
+ CALL_SUBTEST_3(test_signed_64bit());
+ CALL_SUBTEST_4(test_unsigned_64bit());
}
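TensorIntDivisor trades one up-front "magic number" reciprocal computation per divisor for a multiply-and-shift on every subsequent division, which is what these loops verify against ordinary integer division. A minimal sketch of the same check, assuming the operator/ overload exercised by the tests above:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

void intdiv_sketch()
{
  // Reciprocal is precomputed once for a divisor known only at run time.
  const Eigen::internal::TensorIntDivisor<int32_t> div(7);
  for (int32_t j = 0; j < 1000; ++j) {
    const int32_t fast = j / div;  // multiply-and-shift, no hardware divide
    assert(fast == j / 7);
  }
}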
diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
new file mode 100644
index 000000000..72f826216
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -0,0 +1,53 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_simple()
+{
+ Tensor<float, 1, ColMajor> vec1({6});
+ Tensor<float, 1, ColMajor, int> vec2({6});
+
+ vec1(0) = 4.0; vec2(0) = 0.0;
+ vec1(1) = 8.0; vec2(1) = 1.0;
+ vec1(2) = 15.0; vec2(2) = 2.0;
+ vec1(3) = 16.0; vec2(3) = 3.0;
+ vec1(4) = 23.0; vec2(4) = 4.0;
+ vec1(5) = 42.0; vec2(5) = 5.0;
+
+ float data3[6];
+ TensorMap<Tensor<float, 1, ColMajor>> vec3(data3, 6);
+ vec3 = vec1.sqrt();
+ float data4[6];
+ TensorMap<Tensor<float, 1, ColMajor, int>> vec4(data4, 6);
+ vec4 = vec2.square();
+
+ VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+ VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+ VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+ VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+ VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+ VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+ VERIFY_IS_APPROX(vec4(0), 0.0f);
+ VERIFY_IS_APPROX(vec4(1), 1.0f);
+ VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
+ VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
+ VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
+ VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
+}
+
+
+void test_cxx11_tensor_mixed_indices()
+{
+ CALL_SUBTEST(test_simple());
+}
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index cf9fd4803..733154543 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -22,11 +22,11 @@ static void test_simple_reshape()
Tensor<float, 2> tensor3(6,7);
Tensor<float, 2> tensor4(2,21);
- Tensor<float, 3>::Dimensions dim1{{2,3,7}};
+ Tensor<float, 3>::Dimensions dim1(2,3,7);
tensor2 = tensor1.reshape(dim1);
- Tensor<float, 2>::Dimensions dim2{{6,7}};
+ Tensor<float, 2>::Dimensions dim2(6,7);
tensor3 = tensor1.reshape(dim2);
- Tensor<float, 2>::Dimensions dim3{{2,21}};
+ Tensor<float, 2>::Dimensions dim3(2,21);
tensor4 = tensor1.reshape(dim1).reshape(dim3);
for (int i = 0; i < 2; ++i) {
@@ -50,8 +50,8 @@ static void test_reshape_in_expr() {
TensorMap<Tensor<float, 5>> tensor1(m1.data(), 2,3,5,7,11);
TensorMap<Tensor<float, 5>> tensor2(m2.data(), 3,5,7,11,13);
- Tensor<float, 2>::Dimensions newDims1{{2,3*5*7*11}};
- Tensor<float, 2>::Dimensions newDims2{{3*5*7*11,13}};
+ Tensor<float, 2>::Dimensions newDims1(2,3*5*7*11);
+ Tensor<float, 2>::Dimensions newDims2(3*5*7*11,13);
typedef Tensor<float, 1>::DimensionPair DimPair;
array<DimPair, 1> contract_along{{DimPair(1, 0)}};
Tensor<float, 2> tensor3(2,13);
@@ -72,7 +72,7 @@ static void test_reshape_as_lvalue()
tensor.setRandom();
Tensor<float, 2> tensor2d(6,7);
- Tensor<float, 3>::Dimensions dim{{2,3,7}};
+ Tensor<float, 3>::Dimensions dim(2,3,7);
tensor2d.reshape(dim) = tensor;
float scratch[2*3*1*7*1];
@@ -221,33 +221,33 @@ static void test_slice_raw_data()
Eigen::DSizes<ptrdiff_t, 4> extents(1,1,1,1);
typedef TensorEvaluator<decltype(tensor.slice(offsets, extents)), DefaultDevice> SliceEvaluator;
auto slice1 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1ul);
+ VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1);
VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4));
if (DataLayout == ColMajor) {
extents = Eigen::DSizes<ptrdiff_t, 4>(2,1,1,1);
auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul);
+ VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2);
VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4));
} else {
extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,1,2);
auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul);
+ VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2);
VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5));
}
extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);
auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2ul);
+ VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2);
VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0));
if (DataLayout == ColMajor) {
offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
extents = Eigen::DSizes<ptrdiff_t, 4>(3,2,1,1);
auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul);
+ VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 2; ++j) {
VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4));
@@ -257,7 +257,7 @@ static void test_slice_raw_data()
offsets = Eigen::DSizes<ptrdiff_t, 4>(1,2,3,0);
extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,2,11);
auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22ul);
+ VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22);
for (int l = 0; l < 11; ++l) {
for (int k = 0; k < 2; ++k) {
VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l));
@@ -269,7 +269,7 @@ static void test_slice_raw_data()
offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,4);
extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,2);
auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul);
+ VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 5; ++j) {
for (int k = 0; k < 7; ++k) {
@@ -284,7 +284,7 @@ static void test_slice_raw_data()
offsets = Eigen::DSizes<ptrdiff_t, 4>(1,0,0,0);
extents = Eigen::DSizes<ptrdiff_t, 4>(2,5,7,11);
auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770ul);
+ VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770);
for (int l = 0; l < 11; ++l) {
for (int k = 0; k < 7; ++k) {
for (int j = 0; j < 5; ++j) {
@@ -301,7 +301,7 @@ static void test_slice_raw_data()
offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,0);
extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,11);
auto slice6 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
- VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3ul*5*7*11);
+ VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3*5*7*11);
VERIFY_IS_EQUAL(slice6.data(), tensor.data());
}
@@ -311,11 +311,11 @@ static void test_composition()
Eigen::Tensor<float, 2, DataLayout> matrix(7, 11);
matrix.setRandom();
- const DSizes<ptrdiff_t, 3> newDims{{1, 1, 11}};
+ const DSizes<ptrdiff_t, 3> newDims(1, 1, 11);
Eigen::Tensor<float, 3, DataLayout> tensor =
matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
- VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11ul);
+ VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11);
VERIFY_IS_EQUAL(tensor.dimension(0), 1);
VERIFY_IS_EQUAL(tensor.dimension(1), 1);
VERIFY_IS_EQUAL(tensor.dimension(2), 11);
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
index 24b2bcb58..8ad04f699 100644
--- a/unsupported/test/cxx11_tensor_of_complex.cpp
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -67,7 +67,7 @@ static void test_contractions()
Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
t_result = t_left.contract(t_right, dims);
m_result = m_left * m_right;
- for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
}
}
diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
index 8d05d154e..4ef9aed91 100644
--- a/unsupported/test/cxx11_tensor_of_strings.cpp
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -94,9 +94,9 @@ static void test_slices()
}
}
- const Eigen::DSizes<ptrdiff_t, 2> half_size{{2, 3}};
- const Eigen::DSizes<ptrdiff_t, 2> first_half{{0, 0}};
- const Eigen::DSizes<ptrdiff_t, 2> second_half{{0, 3}};
+ const Eigen::DSizes<ptrdiff_t, 2> half_size(2, 3);
+ const Eigen::DSizes<ptrdiff_t, 2> first_half(0, 0);
+ const Eigen::DSizes<ptrdiff_t, 2> second_half(0, 3);
Tensor<std::string, 2> t1 = data.slice(first_half, half_size);
Tensor<std::string, 2> t2 = data.slice(second_half, half_size);
diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
index 8276ae822..85937ec77 100644
--- a/unsupported/test/cxx11_tensor_random.cpp
+++ b/unsupported/test/cxx11_tensor_random.cpp
@@ -44,7 +44,7 @@ struct MyGenerator {
// location of the entry to set in the tensor, it can typically
// be ignored.
int operator()(Eigen::DenseIndex element_location, Eigen::DenseIndex /*unused*/ = 0) const {
- return 3 * element_location;
+ return static_cast<int>(3 * element_location);
}
// Same as above but generates several numbers at a time.
@@ -53,7 +53,7 @@ struct MyGenerator {
const int packetSize = internal::packet_traits<int>::size;
EIGEN_ALIGN_DEFAULT int values[packetSize];
for (int i = 0; i < packetSize; ++i) {
- values[i] = 3 * (packet_location + i);
+ values[i] = static_cast<int>(3 * (packet_location + i));
}
return internal::pload<typename internal::packet_traits<int>::type>(values);
}
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
index 23855fca0..8cd2ab7fd 100644
--- a/unsupported/test/cxx11_tensor_simple.cpp
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -267,21 +267,21 @@ static void test_resize()
VERIFY_IS_EQUAL(epsilon.dimension(0), 2);
VERIFY_IS_EQUAL(epsilon.dimension(1), 3);
VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
- VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7);
+ VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2*3*7);
const int* old_data = epsilon.data();
epsilon.resize(3,2,7);
VERIFY_IS_EQUAL(epsilon.dimension(0), 3);
VERIFY_IS_EQUAL(epsilon.dimension(1), 2);
VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
- VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7);
+ VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2*3*7);
VERIFY_IS_EQUAL(epsilon.data(), old_data);
epsilon.resize(3,5,7);
VERIFY_IS_EQUAL(epsilon.dimension(0), 3);
VERIFY_IS_EQUAL(epsilon.dimension(1), 5);
VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
- VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 3ul*5*7);
+ VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 3*5*7);
VERIFY_IS_NOT_EQUAL(epsilon.data(), old_data);
}
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index 6fe65c7f9..05b55f706 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -26,7 +26,8 @@ static void test_multithread_elementwise()
in1.setRandom();
in2.setRandom();
- Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11));
+ Eigen::ThreadPool tp(internal::random<int>(3, 11));
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
out.device(thread_pool_device) = in1 + in2 * 3.14f;
for (int i = 0; i < 2; ++i) {
@@ -48,7 +49,8 @@ static void test_multithread_compound_assignment()
in1.setRandom();
in2.setRandom();
- Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(3, 11));
+ Eigen::ThreadPool tp(internal::random<int>(3, 11));
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
out.device(thread_pool_device) = in1;
out.device(thread_pool_device) += in2 * 3.14f;
@@ -80,7 +82,8 @@ static void test_multithread_contraction()
MapXf m_right(t_right.data(), 1147, 1400);
Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
- Eigen::ThreadPoolDevice thread_pool_device(4);
+ Eigen::ThreadPool tp(4);
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);
// compute results by separate methods
t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
@@ -115,7 +118,8 @@ static void test_contraction_corner_cases()
MapXf m_right(t_right.data(), 32, 28*28);
Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);
- Eigen::ThreadPoolDevice thread_pool_device(12);
+ Eigen::ThreadPool tp(12);
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);
// compute results by separate methods
t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
@@ -204,7 +208,8 @@ static void test_multithread_contraction_agrees_with_singlethread() {
typedef Tensor<float, 1>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
- Eigen::ThreadPoolDevice thread_pool_device(internal::random<int>(2, 11));
+ Eigen::ThreadPool tp(internal::random<int>(2, 11));
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
Tensor<float, 5, DataLayout> st_result;
st_result = left.contract(right, dims);
@@ -227,7 +232,8 @@ static void test_memcpy() {
for (int i = 0; i < 5; ++i) {
const int num_threads = internal::random<int>(3, 11);
- Eigen::ThreadPoolDevice thread_pool_device(num_threads);
+ Eigen::ThreadPool tp(num_threads);
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);
const int size = internal::random<int>(13, 7632);
Tensor<float, 1> t1(size);
@@ -243,7 +249,8 @@ static void test_memcpy() {
static void test_multithread_random()
{
- Eigen::ThreadPoolDevice device(2);
+ Eigen::ThreadPool tp(2);
+ Eigen::ThreadPoolDevice device(&tp, 2);
Tensor<float, 1> t(1 << 20);
t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
}
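The thread-pool changes all follow one pattern: ThreadPoolDevice no longer spawns threads itself, it borrows an Eigen::ThreadPool created, and kept alive, by the caller. A condensed sketch of the post-patch construction:

#define EIGEN_USE_THREADS

#include <unsupported/Eigen/CXX11/Tensor>

void thread_pool_sketch()
{
  Eigen::Tensor<float, 2> a(100, 100), b(100, 100), c(100, 100);
  a.setRandom();
  b.setRandom();

  Eigen::ThreadPool pool(4);                 // owns the worker threads
  Eigen::ThreadPoolDevice device(&pool, 4);  // borrows them; pool must outlive device
  c.device(device) = a + b * 3.14f;          // expression evaluated on the pool
}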
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 1fa1c3c22..a2bdb99e4 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -9,6 +9,9 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// FIXME: These tests all check for hard-coded values. Ideally, parameters and start estimates should be randomized.
+
+
#include <stdio.h>
#include "main.h"
@@ -275,7 +278,7 @@ const double chwirut2_functor::m_y[54] = { 92.9000E0 ,57.1000E0 ,31.0500E0 ,11.5
void testNistChwirut2(void)
{
const int n=3;
- int info;
+ LevenbergMarquardtSpace::Status info;
VectorXd x(n);
@@ -610,7 +613,7 @@ const double lanczos1_functor::y[24] = { 2.513400000000E+00 ,2.044333373291E+00
void testNistLanczos1(void)
{
const int n=6;
- int info;
+ LevenbergMarquardtSpace::Status info;
VectorXd x(n);
@@ -624,7 +627,7 @@ void testNistLanczos1(void)
info = lm.minimize(x);
// check return value
- VERIFY_IS_EQUAL(info, 2);
+ VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
VERIFY_IS_EQUAL(lm.nfev(), 79);
VERIFY_IS_EQUAL(lm.njev(), 72);
// check norm^2
@@ -645,7 +648,7 @@ void testNistLanczos1(void)
info = lm.minimize(x);
// check return value
- VERIFY_IS_EQUAL(info, 2);
+ VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
VERIFY_IS_EQUAL(lm.nfev(), 9);
VERIFY_IS_EQUAL(lm.njev(), 8);
// check norm^2
@@ -696,7 +699,7 @@ const double rat42_functor::y[9] = { 8.930E0 ,10.800E0 ,18.590E0 ,22.330E0 ,39.3
void testNistRat42(void)
{
const int n=3;
- int info;
+ LevenbergMarquardtSpace::Status info;
VectorXd x(n);
@@ -710,7 +713,7 @@ void testNistRat42(void)
info = lm.minimize(x);
// check return value
- VERIFY_IS_EQUAL(info, 1);
+ VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
VERIFY_IS_EQUAL(lm.nfev(), 10);
VERIFY_IS_EQUAL(lm.njev(), 8);
// check norm^2
@@ -728,7 +731,7 @@ void testNistRat42(void)
info = lm.minimize(x);
// check return value
- VERIFY_IS_EQUAL(info, 1);
+ VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
VERIFY_IS_EQUAL(lm.nfev(), 6);
VERIFY_IS_EQUAL(lm.njev(), 5);
// check norm^2
@@ -774,7 +777,7 @@ const double MGH10_functor::y[16] = { 3.478000E+04, 2.861000E+04, 2.365000E+04,
void testNistMGH10(void)
{
const int n=3;
- int info;
+ LevenbergMarquardtSpace::Status info;
VectorXd x(n);
@@ -786,6 +789,7 @@ void testNistMGH10(void)
MGH10_functor functor;
LevenbergMarquardt<MGH10_functor> lm(functor);
info = lm.minimize(x);
+ VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
// check norm^2
VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
@@ -805,6 +809,7 @@ void testNistMGH10(void)
x<< 0.02, 4000., 250.;
// do the computation
info = lm.minimize(x);
+ VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
// check norm^2
VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
@@ -891,8 +896,8 @@ void testNistBoxBOD(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev(), 15 );
- VERIFY_IS_EQUAL(lm.njev(), 14 );
+ VERIFY_IS_EQUAL(lm.nfev(), 16 );
+ VERIFY_IS_EQUAL(lm.njev(), 15 );
// check norm^2
VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
// check x
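The Levenberg-Marquardt test updates all stem from minimize() returning a LevenbergMarquardtSpace::Status enumerator; comparing against named values instead of magic integers (1, 2, ...) is the post-patch pattern. A sketch, with converged_acceptably as an illustrative helper name:

#include <unsupported/Eigen/LevenbergMarquardt>

template <typename Functor>
bool converged_acceptably(Functor& functor, Eigen::VectorXd& x)
{
  Eigen::LevenbergMarquardt<Functor> lm(functor);
  Eigen::LevenbergMarquardtSpace::Status info = lm.minimize(x);
  // Both statuses indicate normal convergence, matching the expectations
  // encoded in the tests above (formerly the integers 1 and 2).
  return info == Eigen::LevenbergMarquardtSpace::RelativeReductionTooSmall
      || info == Eigen::LevenbergMarquardtSpace::RelativeErrorTooSmall;
}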