-rw-r--r--  Eigen/CholmodSupport | 2
-rw-r--r--  Eigen/SPQRSupport | 2
-rw-r--r--  Eigen/UmfPackSupport | 2
-rw-r--r--  Eigen/src/CholmodSupport/CholmodSupport.h | 5
-rw-r--r--  Eigen/src/Core/ArrayBase.h | 16
-rw-r--r--  Eigen/src/Core/ArrayWrapper.h | 38
-rwxr-xr-x  Eigen/src/Core/AssignEvaluator.h | 34
-rw-r--r--  Eigen/src/Core/Block.h | 35
-rw-r--r--  Eigen/src/Core/CoreEvaluators.h | 199
-rw-r--r--  Eigen/src/Core/CwiseBinaryOp.h | 4
-rw-r--r--  Eigen/src/Core/CwiseUnaryOp.h | 29
-rw-r--r--  Eigen/src/Core/CwiseUnaryView.h | 9
-rw-r--r--  Eigen/src/Core/DenseBase.h | 12
-rw-r--r--  Eigen/src/Core/Diagonal.h | 14
-rw-r--r--  Eigen/src/Core/Dot.h | 46
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 7
-rw-r--r--  Eigen/src/Core/GlobalFunctions.h | 1
-rw-r--r--  Eigen/src/Core/MathFunctions.h | 12
-rw-r--r--  Eigen/src/Core/MatrixBase.h | 10
-rw-r--r--  Eigen/src/Core/SelfAdjointView.h | 6
-rw-r--r--  Eigen/src/Core/SpecialFunctions.h | 385
-rw-r--r--  Eigen/src/Core/Transpose.h | 10
-rw-r--r--  Eigen/src/Core/Transpositions.h | 2
-rw-r--r--  Eigen/src/Core/TriangularMatrix.h | 9
-rwxr-xr-x  Eigen/src/Core/VectorwiseOp.h | 2
-rw-r--r--  Eigen/src/Core/Visitor.h | 6
-rw-r--r--  Eigen/src/Core/arch/CUDA/MathFunctions.h | 14
-rw-r--r--  Eigen/src/Core/arch/CUDA/PacketMath.h | 2
-rw-r--r--  Eigen/src/Core/functors/NullaryFunctors.h | 63
-rw-r--r--  Eigen/src/Core/functors/UnaryFunctors.h | 24
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h | 3
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 48
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 46
-rw-r--r--  Eigen/src/Core/products/SelfadjointProduct.h | 24
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix.h | 8
-rwxr-xr-x  Eigen/src/Core/util/DisableStupidWarnings.h | 3
-rw-r--r--  Eigen/src/Core/util/Macros.h | 1
-rw-r--r--  Eigen/src/Core/util/Memory.h | 6
-rw-r--r--  Eigen/src/Core/util/Meta.h | 20
-rw-r--r--  Eigen/src/Core/util/StaticAssert.h | 2
-rw-r--r--  Eigen/src/Core/util/XprHelper.h | 33
-rw-r--r--  Eigen/src/Geometry/ParametrizedLine.h | 2
-rw-r--r--  Eigen/src/Geometry/Translation.h | 2
-rw-r--r--  Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h | 184
-rw-r--r--  Eigen/src/OrderingMethods/Amd.h | 2
-rw-r--r--  Eigen/src/OrderingMethods/Eigen_Colamd.h | 29
-rw-r--r--  Eigen/src/SparseCore/SparseBlock.h | 37
-rw-r--r--  Eigen/src/SparseCore/SparseCompressedBase.h | 21
-rw-r--r--  Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 221
-rw-r--r--  Eigen/src/SparseCore/SparseDenseProduct.h | 2
-rw-r--r--  Eigen/src/SparseCore/SparseMatrix.h | 7
-rw-r--r--  Eigen/src/SparseCore/SparseSelfAdjointView.h | 12
-rw-r--r--  Eigen/src/SparseCore/SparseVector.h | 31
-rw-r--r--  Eigen/src/SparseCore/SparseView.h | 2
-rw-r--r--  Eigen/src/SparseQR/SparseQR.h | 11
-rw-r--r--  Eigen/src/plugins/ArrayCwiseUnaryOps.h | 11
-rw-r--r--  bench/tensors/README | 8
-rw-r--r--  bench/tensors/benchmark.h | 49
-rw-r--r--  bench/tensors/benchmark_main.cc | 237
-rw-r--r--  bench/tensors/tensor_benchmarks.h | 199
-rw-r--r--  bench/tensors/tensor_benchmarks_cpu.cc | 48
-rw-r--r--  bench/tensors/tensor_benchmarks_gpu.cu (renamed from bench/tensors/tensor_benchmarks_gpu.cc) | 25
-rw-r--r--  blas/level2_cplx_impl.h | 95
-rw-r--r--  blas/level2_impl.h | 303
-rw-r--r--  blas/level2_real_impl.h | 123
-rw-r--r--  blas/level3_impl.h | 339
-rw-r--r--  doc/AsciiQuickReference.txt | 5
-rw-r--r--  doc/SparseLinearSystems.dox | 6
-rw-r--r--  doc/TopicAliasing.dox | 30
-rw-r--r--  doc/TutorialReductionsVisitorsBroadcasting.dox | 14
-rw-r--r--  doc/TutorialSparse.dox | 7
-rw-r--r--  doc/UsingIntelMKL.dox | 2
-rw-r--r--  doc/snippets/TopicAliasing_mult4.cpp | 5
-rw-r--r--  doc/snippets/TopicAliasing_mult5.cpp | 5
-rw-r--r--  test/adjoint.cpp | 2
-rw-r--r--  test/array.cpp | 21
-rw-r--r--  test/array_for_matrix.cpp | 10
-rw-r--r--  test/diagonal.cpp | 7
-rw-r--r--  test/incomplete_cholesky.cpp | 30
-rw-r--r--  test/mixingtypes.cpp | 41
-rw-r--r--  test/nomalloc.cpp | 13
-rw-r--r--  test/nullary.cpp | 32
-rw-r--r--  test/sparse_basic.cpp | 32
-rw-r--r--  test/sparse_vector.cpp | 54
-rw-r--r--  test/stable_norm.cpp | 15
-rw-r--r--  test/vectorwiseop.cpp | 3
-rw-r--r--  test/zerosized.cpp | 18
-rw-r--r--  unsupported/Eigen/AlignedVector3 | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 39
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4
-rw-r--r--  unsupported/Eigen/src/Splines/SplineFitting.h | 2
-rw-r--r--  unsupported/test/CMakeLists.txt | 28
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_cuda.cu (renamed from unsupported/test/cxx11_tensor_argmax_cuda.cpp) | 22
-rw-r--r--  unsupported/test/cxx11_tensor_contract_cuda.cu (renamed from unsupported/test/cxx11_tensor_contract_cuda.cpp) | 108
-rw-r--r--  unsupported/test/cxx11_tensor_cuda.cu (renamed from unsupported/test/cxx11_tensor_cuda.cpp) | 158
-rw-r--r--  unsupported/test/cxx11_tensor_device.cu (renamed from unsupported/test/cxx11_tensor_device.cpp) | 22
-rw-r--r--  unsupported/test/cxx11_tensor_empty.cpp | 40
-rw-r--r--  unsupported/test/cxx11_tensor_random_cuda.cu (renamed from unsupported/test/cxx11_tensor_random_cuda.cpp) | 0
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_cuda.cu (renamed from unsupported/test/cxx11_tensor_reduction_cuda.cpp) | 7
-rw-r--r--  unsupported/test/cxx11_tensor_thread_pool.cpp | 85
113 files changed, 2885 insertions, 1261 deletions
diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport
index 83e2c1da4..bed8924d3 100644
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -19,7 +19,7 @@ extern "C" {
/** \ingroup Support_modules
* \defgroup CholmodSupport_Module CholmodSupport module
*
- * This module provides an interface to the Cholmod library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+ * This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
* It provides the two following main factorization classes:
* - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
 * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport
index f9489dcd8..f70390c17 100644
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -17,7 +17,7 @@
/** \ingroup Support_modules
* \defgroup SPQRSupport_Module SuiteSparseQR module
*
- * This module provides an interface to the SPQR library, which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+ * This module provides an interface to the SPQR library, which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
*
* \code
* #include <Eigen/SPQRSupport>
diff --git a/Eigen/UmfPackSupport b/Eigen/UmfPackSupport
index 4a9f46a1e..00eec8087 100644
--- a/Eigen/UmfPackSupport
+++ b/Eigen/UmfPackSupport
@@ -19,7 +19,7 @@ extern "C" {
/** \ingroup Support_modules
* \defgroup UmfPackSupport_Module UmfPackSupport module
*
- * This module provides an interface to the UmfPack library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
+ * This module provides an interface to the UmfPack library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
* It provides the following factorization class:
* - class UmfPackLU: a multifrontal sequential LU factorization.
*
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index c7c521b95..b8020a92c 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -273,9 +273,10 @@ class CholmodBase : public SparseSolverBase<Derived>
const Index size = m_cholmodFactor->n;
EIGEN_UNUSED_VARIABLE(size);
eigen_assert(size==b.rows());
- // note: cd stands for Cholmod Dense
- Rhs& b_ref(b.const_cast_derived());
+
+ // Cholmod needs column-major storage without inner stride, which corresponds to the default behavior of Ref.
+ Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
cholmod_dense b_cd = viewAsCholmod(b_ref);
cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
if(!x_cd)
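
Note on the switch to Ref above: a const Ref<Matrix<...,ColMajor> > binds directly to any column-major argument with unit inner stride, and otherwise silently evaluates the argument into a temporary with that layout, so cholmod_solve always sees the storage it expects. A minimal sketch of this behavior (illustrative only, double scalars assumed):

    #include <Eigen/Dense>
    using namespace Eigen;

    void ref_layout_sketch()
    {
      MatrixXd b = MatrixXd::Random(10, 2);
      // Column-major, unit inner stride: the Ref points at b's data, no copy.
      Ref<const Matrix<double,Dynamic,Dynamic,ColMajor> > r1(b);

      Matrix<double,Dynamic,Dynamic,RowMajor> b_rm = b;
      // Row-major argument: the const Ref evaluates it into a column-major
      // temporary owned by the Ref itself.
      Ref<const Matrix<double,Dynamic,Dynamic,ColMajor> > r2(b_rm);
      (void)r1; (void)r2;
    }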
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index b4c24a27a..0443e3032 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -103,7 +103,7 @@ template<typename Derived> class ArrayBase
/** Special case of the template operator=, in order to prevent the compiler
* from generating a default operator= (issue hit with g++ 4.1)
*/
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator=(const ArrayBase& other)
{
internal::call_assignment(derived(), other.derived());
@@ -112,28 +112,28 @@ template<typename Derived> class ArrayBase
/** Set all the entries to \a value.
* \sa DenseBase::setConstant(), DenseBase::fill() */
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator=(const Scalar &value)
{ Base::setConstant(value); return derived(); }
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator+=(const Scalar& scalar);
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const Scalar& scalar);
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator+=(const ArrayBase<OtherDerived>& other);
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const ArrayBase<OtherDerived>& other);
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator*=(const ArrayBase<OtherDerived>& other);
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator/=(const ArrayBase<OtherDerived>& other);
public:
diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h
index 4e484f290..6013d4d85 100644
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@@ -52,7 +52,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
const Scalar
>::type ScalarWithConstIfNotLvalue;
- typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;
+ typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
EIGEN_DEVICE_FUNC
explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
@@ -67,7 +67,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
inline Index innerStride() const { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC
- inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+ inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return m_expression.data(); }
@@ -80,13 +80,13 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
- return m_expression.const_cast_derived().coeffRef(rowId, colId);
+ return m_expression.coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
- return m_expression.const_cast_derived().coeffRef(rowId, colId);
+ return m_expression.coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
@@ -98,13 +98,13 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
- return m_expression.const_cast_derived().coeffRef(index);
+ return m_expression.coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
- return m_expression.const_cast_derived().coeffRef(index);
+ return m_expression.coeffRef(index);
}
template<int LoadMode>
@@ -116,7 +116,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
template<int LoadMode>
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
{
- m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
+ m_expression.template writePacket<LoadMode>(rowId, colId, val);
}
template<int LoadMode>
@@ -128,7 +128,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
template<int LoadMode>
inline void writePacket(Index index, const PacketScalar& val)
{
- m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+ m_expression.template writePacket<LoadMode>(index, val);
}
template<typename Dest>
@@ -145,11 +145,11 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index) */
EIGEN_DEVICE_FUNC
- void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+ void resize(Index newSize) { m_expression.resize(newSize); }
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index,Index)*/
EIGEN_DEVICE_FUNC
- void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
+ void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
protected:
NestedExpressionType m_expression;
@@ -195,7 +195,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
const Scalar
>::type ScalarWithConstIfNotLvalue;
- typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;
+ typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
EIGEN_DEVICE_FUNC
explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
@@ -210,7 +210,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
inline Index innerStride() const { return m_expression.innerStride(); }
EIGEN_DEVICE_FUNC
- inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+ inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
EIGEN_DEVICE_FUNC
inline const Scalar* data() const { return m_expression.data(); }
@@ -223,7 +223,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index rowId, Index colId)
{
- return m_expression.const_cast_derived().coeffRef(rowId, colId);
+ return m_expression.coeffRef(rowId, colId);
}
EIGEN_DEVICE_FUNC
@@ -241,13 +241,13 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index index)
{
- return m_expression.const_cast_derived().coeffRef(index);
+ return m_expression.coeffRef(index);
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
- return m_expression.const_cast_derived().coeffRef(index);
+ return m_expression.coeffRef(index);
}
template<int LoadMode>
@@ -259,7 +259,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
template<int LoadMode>
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
{
- m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
+ m_expression.template writePacket<LoadMode>(rowId, colId, val);
}
template<int LoadMode>
@@ -271,7 +271,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
template<int LoadMode>
inline void writePacket(Index index, const PacketScalar& val)
{
- m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
+ m_expression.template writePacket<LoadMode>(index, val);
}
EIGEN_DEVICE_FUNC
@@ -284,11 +284,11 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index) */
EIGEN_DEVICE_FUNC
- void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
+ void resize(Index newSize) { m_expression.resize(newSize); }
/** Forwards the resizing request to the nested expression
* \sa DenseBase::resize(Index,Index)*/
EIGEN_DEVICE_FUNC
- void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
+ void resize(Index rows, Index cols) { m_expression.resize(rows,cols); }
protected:
NestedExpressionType m_expression;
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index f6632de69..5b65bfb0c 100755
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -637,7 +637,7 @@ protected:
***************************************************************************/
template<typename DstXprType, typename SrcXprType, typename Functor>
-EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -654,7 +654,7 @@ EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const S
}
template<typename DstXprType, typename SrcXprType>
-EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
{
call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
}
@@ -688,26 +688,30 @@ struct Assignment;
// does not have to bother about these annoying details.
template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src)
{
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
}
template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(const Dst& dst, const Src& src)
{
call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
}
// Deal with "assume-aliasing"
template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
{
typename plain_matrix_type<Src>::type tmp(src);
call_assignment_no_alias(dst, tmp, func);
}
template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
{
call_assignment_no_alias(dst, src, func);
}
@@ -715,14 +719,16 @@ EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& fun
// by-pass "assume-aliasing"
// When there is no aliasing, we require that 'dst' has been properly resized
template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
{
call_assignment_no_alias(dst.expression(), src, func);
}
template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
{
enum {
NeedToTranspose = ( (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
@@ -747,13 +753,15 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const
Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
}
template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias(Dst& dst, const Src& src)
{
call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
}
template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
{
Index dstRows = src.rows();
Index dstCols = src.cols();
@@ -767,7 +775,8 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src
Assignment<Dst,Src,Func>::run(dst, src, func);
}
template<typename Dst, typename Src>
-EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
{
call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>());
}
@@ -779,7 +788,8 @@ template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, con
template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
{
- EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 599b714cc..cf962aed1 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -129,8 +129,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
: Impl(xpr, startRow, startCol)
{
EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
- eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows()
- && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols());
+ eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows()
+ && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols());
}
/** Dynamic-size constructor
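
The relaxed assertion above (BlockRows/BlockCols >= 0 rather than >= 1) makes zero-sized fixed blocks legal, matching what dynamic-size empty blocks already allow. A small illustration (mine, not taken from the test suite):

    #include <Eigen/Dense>
    using namespace Eigen;

    void empty_block_sketch()
    {
      Matrix4f m = Matrix4f::Identity();
      // 0x4 fixed-size block: rejected by the old "BlockRows >= 1" assertion,
      // accepted now; rows()==0 and cols()==4, and no coefficient is accessed.
      Block<Matrix4f, 0, 4> empty(m, 1, 0);
      (void)empty;
    }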
@@ -221,15 +221,13 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
inline Scalar& coeffRef(Index rowId, Index colId)
{
EIGEN_STATIC_ASSERT_LVALUE(XprType)
- return m_xpr.const_cast_derived()
- .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+ return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index rowId, Index colId) const
{
- return m_xpr.derived()
- .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+ return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
}
EIGEN_DEVICE_FUNC
@@ -242,39 +240,34 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
inline Scalar& coeffRef(Index index)
{
EIGEN_STATIC_ASSERT_LVALUE(XprType)
- return m_xpr.const_cast_derived()
- .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
- m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index index) const
{
- return m_xpr.const_cast_derived()
- .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
- m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
EIGEN_DEVICE_FUNC
inline const CoeffReturnType coeff(Index index) const
{
- return m_xpr
- .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
- m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
template<int LoadMode>
inline PacketScalar packet(Index rowId, Index colId) const
{
- return m_xpr.template packet<Unaligned>
- (rowId + m_startRow.value(), colId + m_startCol.value());
+ return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
}
template<int LoadMode>
inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
{
- m_xpr.const_cast_derived().template writePacket<Unaligned>
- (rowId + m_startRow.value(), colId + m_startCol.value(), val);
+ m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
}
template<int LoadMode>
@@ -288,7 +281,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
template<int LoadMode>
inline void writePacket(Index index, const PacketScalar& val)
{
- m_xpr.const_cast_derived().template writePacket<Unaligned>
+ m_xpr.template writePacket<Unaligned>
(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
}
@@ -320,7 +313,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
protected:
- const typename XprType::Nested m_xpr;
+ typename XprType::Nested m_xpr;
const internal::variable_if_dynamic<StorageIndex, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
const internal::variable_if_dynamic<StorageIndex, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 8bd73b814..a729e0454 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -148,7 +148,8 @@ struct evaluator<PlainObjectBase<Derived> >
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
if (IsRowMajor)
return m_data[row * m_outerStride.value() + col];
@@ -156,12 +157,14 @@ struct evaluator<PlainObjectBase<Derived> >
return m_data[row + col * m_outerStride.value()];
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_data[index];
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index row, Index col)
{
if (IsRowMajor)
return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
@@ -169,12 +172,14 @@ struct evaluator<PlainObjectBase<Derived> >
return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index index)
{
return const_cast<Scalar*>(m_data)[index];
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
if (IsRowMajor)
@@ -184,12 +189,14 @@ struct evaluator<PlainObjectBase<Derived> >
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return ploadt<PacketType, LoadMode>(m_data + index);
}
template<int StoreMode,typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index row, Index col, const PacketType& x)
{
if (IsRowMajor)
@@ -201,6 +208,7 @@ struct evaluator<PlainObjectBase<Derived> >
}
template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
@@ -260,45 +268,53 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_argImpl.coeff(col, row);
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_argImpl.coeff(index);
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index row, Index col)
{
return m_argImpl.coeffRef(col, row);
}
- EIGEN_DEVICE_FUNC typename XprType::Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename XprType::Scalar& coeffRef(Index index)
{
return m_argImpl.coeffRef(index);
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
return m_argImpl.template packet<LoadMode,PacketType>(col, row);
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return m_argImpl.template packet<LoadMode,PacketType>(index);
}
- template<int StoreMode, typename PacketType>
+ template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index row, Index col, const PacketType& x)
{
m_argImpl.template writePacket<StoreMode,PacketType>(col, row, x);
}
- template<int StoreMode, typename PacketType>
+ template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
m_argImpl.template writePacket<StoreMode,PacketType>(index, x);
@@ -338,23 +354,27 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
typedef typename XprType::CoeffReturnType CoeffReturnType;
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_functor(row, col);
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_functor(index);
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
return m_functor.template packetOp<Index,PacketType>(row, col);
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return m_functor.template packetOp<Index,PacketType>(index);
@@ -380,7 +400,8 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
Alignment = evaluator<ArgType>::Alignment
};
- EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit unary_evaluator(const XprType& op)
: m_functor(op.functor()),
m_argImpl(op.nestedExpression())
{
@@ -390,23 +411,27 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
typedef typename XprType::CoeffReturnType CoeffReturnType;
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_functor(m_argImpl.coeff(row, col));
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_functor(m_argImpl.coeff(index));
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(row, col));
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index));
@@ -466,17 +491,20 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
typedef typename XprType::CoeffReturnType CoeffReturnType;
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col),
@@ -484,6 +512,7 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index),
@@ -523,22 +552,26 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_unaryOp(m_argImpl.coeff(row, col));
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_unaryOp(m_argImpl.coeff(index));
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index row, Index col)
{
return m_unaryOp(m_argImpl.coeffRef(row, col));
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index index)
{
return m_unaryOp(m_argImpl.coeffRef(index));
}
@@ -578,47 +611,55 @@ struct mapbase_evaluator : evaluator_base<Derived>
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_data[index * m_xpr.innerStride()];
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index row, Index col)
{
return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index index)
{
return m_data[index * m_xpr.innerStride()];
}
- template<int LoadMode, typename PacketType>
+ template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
return internal::ploadt<PacketType, LoadMode>(ptr);
}
- template<int LoadMode, typename PacketType>
+ template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
}
- template<int StoreMode, typename PacketType>
+ template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index row, Index col, const PacketType& x)
{
PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
}
- template<int StoreMode, typename PacketType>
+ template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
@@ -767,46 +808,54 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
RowsAtCompileTime = XprType::RowsAtCompileTime
};
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col);
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index row, Index col)
{
return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col);
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index index)
{
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
}
- template<int LoadMode, typename PacketType>
+ template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col);
}
- template<int LoadMode, typename PacketType>
+ template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0);
}
- template<int StoreMode, typename PacketType>
+ template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index row, Index col, const PacketType& x)
{
return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
}
- template<int StoreMode, typename PacketType>
+ template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
@@ -859,7 +908,7 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
};
- inline EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
+ EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
: m_conditionImpl(select.conditionMatrix()),
m_thenImpl(select.thenMatrix()),
m_elseImpl(select.elseMatrix())
@@ -869,7 +918,8 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
typedef typename XprType::CoeffReturnType CoeffReturnType;
- inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
if (m_conditionImpl.coeff(row, col))
return m_thenImpl.coeff(row, col);
@@ -877,7 +927,8 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
return m_elseImpl.coeff(row, col);
}
- inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
if (m_conditionImpl.coeff(index))
return m_thenImpl.coeff(index);
@@ -921,7 +972,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
m_cols(replicate.nestedExpression().cols())
{}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
// try to avoid using modulo; this is a pure optimization strategy
const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
@@ -934,7 +986,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
return m_argImpl.coeff(actual_row, actual_col);
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
// try to avoid using modulo; this is a pure optimization strategy
const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
@@ -945,6 +998,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
@@ -958,6 +1012,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
@@ -994,7 +1049,7 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
CoeffReadCost = TraversalSize==Dynamic ? HugeCost
: TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
- Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))),
+ Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit,
Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
};
@@ -1008,7 +1063,8 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
typedef typename XprType::CoeffReturnType CoeffReturnType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Scalar coeff(Index i, Index j) const
{
if (Direction==Vertical)
return m_functor(m_arg.col(j));
@@ -1016,7 +1072,8 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
return m_functor(m_arg.row(i));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Scalar coeff(Index index) const
{
if (Direction==Vertical)
return m_functor(m_arg.col(index));
@@ -1051,45 +1108,53 @@ struct evaluator_wrapper_base
typedef typename ArgType::Scalar Scalar;
typedef typename ArgType::CoeffReturnType CoeffReturnType;
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_argImpl.coeff(row, col);
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_argImpl.coeff(index);
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index row, Index col)
{
return m_argImpl.coeffRef(row, col);
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index index)
{
return m_argImpl.coeffRef(index);
}
- template<int LoadMode, typename PacketType>
+ template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
return m_argImpl.template packet<LoadMode,PacketType>(row, col);
}
- template<int LoadMode, typename PacketType>
+ template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return m_argImpl.template packet<LoadMode,PacketType>(index);
}
- template<int StoreMode, typename PacketType>
+ template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index row, Index col, const PacketType& x)
{
m_argImpl.template writePacket<StoreMode>(row, col, x);
}
- template<int StoreMode, typename PacketType>
+ template<int StoreMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
m_argImpl.template writePacket<StoreMode>(index, x);
@@ -1164,29 +1229,34 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
{ }
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index col) const
{
return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
ReverseCol ? m_cols.value() - col - 1 : col);
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index row, Index col)
{
return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
ReverseCol ? m_cols.value() - col - 1 : col);
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index index)
{
return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index row, Index col) const
{
enum {
@@ -1201,6 +1271,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
enum { PacketSize = unpacket_traits<PacketType>::size };
@@ -1208,6 +1279,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index row, Index col, const PacketType& x)
{
// FIXME we could factorize some code with packet(i,j)
@@ -1224,6 +1296,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
}
template<int LoadMode, typename PacketType>
+ EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
enum { PacketSize = unpacket_traits<PacketType>::size };
@@ -1267,22 +1340,26 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index row, Index) const
{
return m_argImpl.coeff(row + rowOffset(), row + colOffset());
}
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CoeffReturnType coeff(Index index) const
{
return m_argImpl.coeff(index + rowOffset(), index + colOffset());
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index row, Index)
{
return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
}
- EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Scalar& coeffRef(Index index)
{
return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
}
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index f94629e6d..39820fd7d 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -32,8 +32,8 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
// we still want to handle the case when the result type is different.
typedef typename result_of<
BinaryOp(
- typename Lhs::Scalar,
- typename Rhs::Scalar
+ const typename Lhs::Scalar&,
+ const typename Rhs::Scalar&
)
>::type Scalar;
typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
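
The Scalar of a CwiseBinaryOp is deduced from the functor's return type for the argument types it is actually invoked with inside the evaluators, hence the const references. A standalone sketch of the same query using std::result_of (Eigen uses its own internal::result_of, so this is only an analogy):

    #include <type_traits>

    struct scale_op {
      // Takes const lvalue references, exactly how the evaluator
      // passes coefficients.
      double operator()(const float& a, const float& b) const { return double(a) * b; }
    };

    // Query with the argument types used at the call site: yields double.
    typedef std::result_of<scale_op(const float&, const float&)>::type Res;
    static_assert(std::is_same<Res, double>::value, "deduced Scalar is double");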
diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h
index 5a809cf21..1d2dd19f2 100644
--- a/Eigen/src/Core/CwiseUnaryOp.h
+++ b/Eigen/src/Core/CwiseUnaryOp.h
@@ -19,7 +19,7 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
: traits<XprType>
{
typedef typename result_of<
- UnaryOp(typename XprType::Scalar)
+ UnaryOp(const typename XprType::Scalar&)
>::type Scalar;
typedef typename XprType::Nested XprTypeNested;
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
@@ -58,33 +58,34 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal
typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
+ typedef typename internal::ref_selector<XprType>::type XprTypeNested;
typedef typename internal::remove_all<XprType>::type NestedExpression;
- EIGEN_DEVICE_FUNC
- explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
: m_xpr(xpr), m_functor(func) {}
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index rows() const { return m_xpr.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index cols() const { return m_xpr.cols(); }
/** \returns the functor representing the unary operation */
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const UnaryOp& functor() const { return m_functor; }
/** \returns the nested expression */
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<XprTypeNested>::type&
nestedExpression() const { return m_xpr; }
/** \returns the nested expression */
- EIGEN_DEVICE_FUNC
- typename internal::remove_all<typename XprType::Nested>::type&
- nestedExpression() { return m_xpr.const_cast_derived(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename internal::remove_all<XprTypeNested>::type&
+ nestedExpression() { return m_xpr; }
protected:
- typename XprType::Nested m_xpr;
+ XprTypeNested m_xpr;
const UnaryOp m_functor;
};
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 5a7db2b19..271033056 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -18,7 +18,7 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
: traits<MatrixType>
{
typedef typename result_of<
- ViewOp(typename traits<MatrixType>::Scalar)
+ ViewOp(const typename traits<MatrixType>::Scalar&)
>::type Scalar;
typedef typename MatrixType::Nested MatrixTypeNested;
typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
@@ -61,6 +61,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+ typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
@@ -75,15 +76,15 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
const ViewOp& functor() const { return m_functor; }
/** \returns the nested expression */
- const typename internal::remove_all<typename MatrixType::Nested>::type&
+ const typename internal::remove_all<MatrixTypeNested>::type&
nestedExpression() const { return m_matrix; }
/** \returns the nested expression */
- typename internal::remove_all<typename MatrixType::Nested>::type&
+ typename internal::remove_reference<MatrixTypeNested>::type&
nestedExpression() { return m_matrix.const_cast_derived(); }
protected:
- typename internal::ref_selector<MatrixType>::type m_matrix;
+ MatrixTypeNested m_matrix;
ViewOp m_functor;
};
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 2c5c0ad28..5a38e5f22 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -275,13 +275,13 @@ template<typename Derived> class DenseBase
/** Copies \a other into *this. \returns a reference to *this. */
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator=(const DenseBase<OtherDerived>& other);
/** Special case of the template operator=, in order to prevent the compiler
* from generating a default operator= (issue hit with g++ 4.1)
*/
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator=(const DenseBase& other);
template<typename OtherDerived>
@@ -388,10 +388,10 @@ template<typename Derived> class DenseBase
inline bool hasNaN() const;
inline bool allFinite() const;
- EIGEN_DEVICE_FUNC
- inline Derived& operator*=(const Scalar& other);
- EIGEN_DEVICE_FUNC
- inline Derived& operator/=(const Scalar& other);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator*=(const Scalar& other);
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator/=(const Scalar& other);
typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
/** \returns the matrix or vector obtained by evaluating this expression.
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index fa3176266..bfea0584b 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -103,21 +103,21 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
>::type ScalarWithConstIfNotLvalue;
EIGEN_DEVICE_FUNC
- inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+ inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
EIGEN_DEVICE_FUNC
- inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+ inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
EIGEN_DEVICE_FUNC
inline Scalar& coeffRef(Index row, Index)
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
- return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+ return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index row, Index) const
{
- return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
+ return m_matrix.coeffRef(row+rowOffset(), row+colOffset());
}
EIGEN_DEVICE_FUNC
@@ -130,13 +130,13 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
inline Scalar& coeffRef(Index idx)
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
- return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+ return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
}
EIGEN_DEVICE_FUNC
inline const Scalar& coeffRef(Index idx) const
{
- return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
+ return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset());
}
EIGEN_DEVICE_FUNC
@@ -159,7 +159,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
}
protected:
- typename MatrixType::Nested m_matrix;
+ typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;
private:
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 221fc3224..82d58fc0b 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -142,6 +142,52 @@ inline void MatrixBase<Derived>::normalize()
derived() /= numext::sqrt(z);
}
+/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow.
+ *
+ * \only_for_vectors
+ *
+ * This method is analogous to the normalized() method, but it reduces the risk of
+ * underflow and overflow when computing the norm.
+ *
+ * \warning If the input vector is too small (i.e., this->norm()==0),
+ * then this function returns a copy of the input.
+ *
+ * \sa stableNorm(), stableNormalize(), normalized()
+ */
+template<typename Derived>
+inline const typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::stableNormalized() const
+{
+ typedef typename internal::nested_eval<Derived,3>::type _Nested;
+ _Nested n(derived());
+ RealScalar w = n.cwiseAbs().maxCoeff();
+ RealScalar z = (n/w).squaredNorm();
+ if(z>RealScalar(0))
+ return n / (numext::sqrt(z)*w);
+ else
+ return n;
+}
+
+/** Normalizes the vector while avoiding underflow and overflow
+ *
+ * \only_for_vectors
+ *
+ * This method is analogous to the normalize() method, but it reduces the risk of
+ * underflow and overflow when computing the norm.
+ *
+ * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+ *
+ * \sa stableNorm(), stableNormalized(), normalize()
+ */
+template<typename Derived>
+inline void MatrixBase<Derived>::stableNormalize()
+{
+ RealScalar w = cwiseAbs().maxCoeff();
+ RealScalar z = (derived()/w).squaredNorm();
+ if(z>RealScalar(0))
+ derived() /= numext::sqrt(z)*w;
+}
+
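A usage sketch of the two methods added above (not part of the patch): with float, v.squaredNorm() overflows to +inf for the values below, so normalized() silently returns the zero vector, while stableNormalized() first rescales by the largest absolute coefficient.

    #include <Eigen/Dense>
    #include <iostream>
    int main() {
      Eigen::Vector3f v(1e20f, 2e20f, 2e20f);      // squaredNorm() == 9e40 > FLT_MAX
      std::cout << v.normalized().transpose() << "\n";        // 0 0 0 (overflow)
      std::cout << v.stableNormalized().transpose() << "\n";  // ~0.333 0.667 0.667
    }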
//---------- implementation of other norms ----------
namespace internal {
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 8ad51bad5..8f63af7cb 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -75,6 +75,7 @@ struct default_packet_traits
HasCosh = 0,
HasTanh = 0,
HasLGamma = 0,
+ HasDiGamma = 0,
HasErf = 0,
HasErfc = 0,
@@ -284,7 +285,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
{ pstore(to, from); }
/** \internal tries to do cache prefetching of \a addr */
-template<typename Scalar> inline void prefetch(const Scalar* addr)
+template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
{
#ifdef __CUDA_ARCH__
#if defined(__LP64__)
@@ -439,6 +440,10 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
+
/** \internal \returns the erf(\a a) (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet perf(const Packet& a) { using numext::erf; return erf(a); }
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 62fec7008..396da8e71 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -50,6 +50,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 1c7b28a4b..e87b60f8f 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -748,9 +748,9 @@ template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
}
//MSVC defines a _isnan builtin function, but for double only
-EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x); }
-EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x); }
-EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x); }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; }
EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); }
@@ -1080,21 +1080,21 @@ struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::
template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
- typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+ const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
{
return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
}
template<typename Scalar> EIGEN_DEVICE_FUNC
inline bool isApprox(const Scalar& x, const Scalar& y,
- typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+ const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
{
return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
}
template<typename Scalar> EIGEN_DEVICE_FUNC
inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
- typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
+ const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
{
return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
}
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index f3935802d..3770ab257 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -135,14 +135,14 @@ template<typename Derived> class MatrixBase
/** Special case of the template operator=, in order to prevent the compiler
* from generating a default operator= (issue hit with g++ 4.1)
*/
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator=(const MatrixBase& other);
// We cannot inherit here via Base::operator= since it is causing
// trouble with MSVC.
template <typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator=(const DenseBase<OtherDerived>& other);
template <typename OtherDerived>
@@ -154,10 +154,10 @@ template<typename Derived> class MatrixBase
Derived& operator=(const ReturnByValue<OtherDerived>& other);
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator+=(const MatrixBase<OtherDerived>& other);
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other);
#ifdef __CUDACC__
@@ -204,7 +204,9 @@ template<typename Derived> class MatrixBase
RealScalar blueNorm() const;
RealScalar hypotNorm() const;
EIGEN_DEVICE_FUNC const PlainObject normalized() const;
+ EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const;
EIGEN_DEVICE_FUNC void normalize();
+ EIGEN_DEVICE_FUNC void stableNormalize();
EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
EIGEN_DEVICE_FUNC void adjointInPlace();
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index e709eb213..9fda02691 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -32,7 +32,7 @@ namespace internal {
template<typename MatrixType, unsigned int UpLo>
struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
{
- typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+ typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
typedef MatrixType ExpressionType;
typedef typename MatrixType::PlainObject FullMatrixType;
@@ -97,7 +97,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
{
EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
Base::check_coordinates_internal(row, col);
- return m_matrix.const_cast_derived().coeffRef(row, col);
+ return m_matrix.coeffRef(row, col);
}
/** \internal */
@@ -107,7 +107,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
EIGEN_DEVICE_FUNC
const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
EIGEN_DEVICE_FUNC
- MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
+ MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; }
/** Efficient triangular matrix times vector/matrix product */
template<typename OtherDerived>
diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index d43cf23a1..9f89e184d 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -13,79 +13,349 @@
namespace Eigen {
namespace internal {
+// Parts of this code are based on the Cephes Math Library.
+//
+// Cephes Math Library Release 2.8: June, 2000
+// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+// Permission has been kindly provided by the original author
+// to incorporate the Cephes software into the Eigen codebase:
+//
+// From: Stephen Moshier
+// To: Eugene Brevdo
+// Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+// Hello Eugene,
+//
+// Thank you for writing.
+//
+// If your licensing is similar to BSD, the formal way that has been
+// handled is simply to add a statement to the effect that you are incorporating
+// the Cephes software by permission of the author.
+//
+// Good luck with your project,
+// Steve
+
+namespace cephes {
+
+/* polevl (modified for Eigen)
+ *
+ * Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ * 2 N
+ * y = C + C x + C x +...+ C x
+ * 0 1 2 N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C , ..., coef[N] = C .
+ * N 0
+ *
+ * The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array. Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized. For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Scalar, int N>
+struct polevl {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ static Scalar run(const Scalar x, const Scalar coef[]) {
+ EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
+ }
+};
+
+template <typename Scalar>
+struct polevl<Scalar, 0> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ static Scalar run(const Scalar, const Scalar coef[]) {
+ return coef[0];
+ }
+};
+
+} // end namespace cephes
+
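A quick sketch of what the recursion above unrolls to: polevl applies Horner's rule with coefficients stored highest-degree first, as the comment describes.

    // polevl<double,2>::run(x, c) == (c[0]*x + c[1])*x + c[2]
    // i.e. 3*x^2 + 2*x + 1 evaluated at x = 2
    const double c[] = {3.0, 2.0, 1.0};
    double y = Eigen::internal::cephes::polevl<double, 2>::run(2.0, c);  // y == 17.0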
/****************************************************************************
* Implementation of lgamma *
****************************************************************************/
-template<typename Scalar>
-struct lgamma_impl
-{
+template <typename Scalar>
+struct lgamma_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
- {
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
}
};
-template<typename Scalar>
-struct lgamma_retval
-{
+template <typename Scalar>
+struct lgamma_retval {
typedef Scalar type;
};
#ifdef EIGEN_HAS_C99_MATH
-template<>
-struct lgamma_impl<float>
-{
+template <>
+struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE double run(const float& x) { return ::lgammaf(x); }
+ static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); }
};
-template<>
-struct lgamma_impl<double>
-{
+template <>
+struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE double run(const double& x) { return ::lgamma(x); }
+ static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); }
};
#endif
/****************************************************************************
+ * Implementation of digamma (psi) *
+ ****************************************************************************/
+
+#ifdef EIGEN_HAS_C99_MATH
+
+/*
+ *
+ * Polynomial evaluation helper for the Psi (digamma) function.
+ *
+ * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for
+ * input Scalar s, assuming s is above 10.0.
+ *
+ * If s is above a certain threshold for the given Scalar type, zero
+ * is returned. Otherwise the polynomial is evaluated with enough
+ * coefficients for results matching Scalar machine precision.
+ *
+ *
+ */
+template <typename Scalar>
+struct digamma_impl_maybe_poly {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+
+template <>
+struct digamma_impl_maybe_poly<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(const float s) {
+ const float A[] = {
+ -4.16666666666666666667E-3,
+ 3.96825396825396825397E-3,
+ -8.33333333333333333333E-3,
+ 8.33333333333333333333E-2
+ };
+
+ float z;
+ if (s < 1.0e8f) {
+ z = 1.0f / (s * s);
+ return z * cephes::polevl<float, 3>::run(z, A);
+ } else return 0.0f;
+ }
+};
+
+template <>
+struct digamma_impl_maybe_poly<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(const double s) {
+ const double A[] = {
+ 8.33333333333333333333E-2,
+ -2.10927960927960927961E-2,
+ 7.57575757575757575758E-3,
+ -4.16666666666666666667E-3,
+ 3.96825396825396825397E-3,
+ -8.33333333333333333333E-3,
+ 8.33333333333333333333E-2
+ };
+
+ double z;
+ if (s < 1.0e17) {
+ z = 1.0 / (s * s);
+ return z * cephes::polevl<double, 6>::run(z, A);
+ }
+ else return 0.0;
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct digamma_retval {
+ typedef Scalar type;
+};
+
+#ifdef EIGEN_HAS_C99_MATH
+template <typename Scalar>
+struct digamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar x) {
+ /*
+ *
+ * Psi (digamma) function (modified for Eigen)
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, psi();
+ *
+ * y = psi( x );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * d -
+ * psi(x) = -- ln | (x)
+ * dx
+ *
+ * is the logarithmic derivative of the gamma function.
+ * For integer x,
+ * n-1
+ * -
+ * psi(n) = -EUL + > 1/k.
+ * -
+ * k=1
+ *
+ * If x is negative, it is transformed to a positive argument by the
+ * reflection formula psi(1-x) = psi(x) + pi cot(pi x).
+ * For general positive x, the argument is made greater than 10
+ * using the recurrence psi(x+1) = psi(x) + 1/x.
+ * Then the following asymptotic expansion is applied:
+ *
+ * inf. B
+ * - 2k
+ * psi(x) = log(x) - 1/2x - > -------
+ * - 2k
+ * k=1 2k x
+ *
+ * where the B2k are Bernoulli numbers.
+ *
+     * ACCURACY (double):
+ * Relative error (except absolute when |psi| < 1):
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 30000 1.3e-15 1.4e-16
+ * IEEE -30,0 40000 1.5e-15 2.2e-16
+ *
+     * ACCURACY (float):
+ * Absolute error, relative when |psi| > 1 :
+ * arithmetic domain # trials peak rms
+ * IEEE -33,0 30000 8.2e-7 1.2e-7
+ * IEEE 0,33 100000 7.3e-7 7.7e-8
+ *
+ * ERROR MESSAGES:
+ * message condition value returned
+ * psi singularity x integer <=0 INFINITY
+ */
+
+ Scalar p, q, nz, s, w, y;
+ bool negative;
+
+ const Scalar maxnum = std::numeric_limits<Scalar>::infinity();
+ const Scalar m_pi = 3.14159265358979323846;
+
+ negative = false;
+ nz = 0.0;
+
+ const Scalar zero = 0.0;
+ const Scalar one = 1.0;
+ const Scalar half = 0.5;
+
+ if (x <= zero) {
+ negative = true;
+ q = x;
+ p = ::floor(q);
+ if (p == q) {
+ return maxnum;
+ }
+ /* Remove the zeros of tan(m_pi x)
+ * by subtracting the nearest integer from x
+ */
+ nz = q - p;
+ if (nz != half) {
+ if (nz > half) {
+ p += one;
+ nz = q - p;
+ }
+ nz = m_pi / ::tan(m_pi * nz);
+ }
+ else {
+ nz = zero;
+ }
+ x = one - x;
+ }
+
+ /* use the recurrence psi(x+1) = psi(x) + 1/x. */
+ s = x;
+ w = zero;
+ while (s < Scalar(10)) {
+ w += one / s;
+ s += one;
+ }
+
+ y = digamma_impl_maybe_poly<Scalar>::run(s);
+
+ y = ::log(s) - (half / s) - y - w;
+
+ return (negative) ? y - nz : y;
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
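A usage sketch of the entry points added elsewhere in this patch (numext::digamma and the array-wise Eigen::digamma); the expected values follow from psi(1) = -gamma and psi(2) = 1 - gamma, with gamma the Euler-Mascheroni constant. Assumes <Eigen/Dense> is included.

    Eigen::ArrayXd a(2);
    a << 1.0, 2.0;
    Eigen::ArrayXd d = Eigen::digamma(a);      // approx. -0.57722, 0.42278
    double d1 = Eigen::numext::digamma(1.0);   // approx. -0.57722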
+/****************************************************************************
* Implementation of erf *
****************************************************************************/
-template<typename Scalar>
-struct erf_impl
-{
+template <typename Scalar>
+struct erf_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
- {
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
}
};
-template<typename Scalar>
-struct erf_retval
-{
+template <typename Scalar>
+struct erf_retval {
typedef Scalar type;
};
#ifdef EIGEN_HAS_C99_MATH
-template<>
-struct erf_impl<float>
-{
+template <>
+struct erf_impl<float> {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE float run(const float& x) { return ::erff(x); }
+ static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
};
-template<>
-struct erf_impl<double>
-{
+template <>
+struct erf_impl<double> {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE double run(const double& x) { return ::erf(x); }
+ static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
};
#endif // EIGEN_HAS_C99_MATH
@@ -93,35 +363,30 @@ struct erf_impl<double>
* Implementation of erfc *
****************************************************************************/
-template<typename Scalar>
-struct erfc_impl
-{
+template <typename Scalar>
+struct erfc_impl {
EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
- {
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
}
};
-template<typename Scalar>
-struct erfc_retval
-{
+template <typename Scalar>
+struct erfc_retval {
typedef Scalar type;
};
#ifdef EIGEN_HAS_C99_MATH
-template<>
-struct erfc_impl<float>
-{
+template <>
+struct erfc_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
};
-template<>
-struct erfc_impl<double>
-{
+template <>
+struct erfc_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
};
@@ -129,27 +394,29 @@ struct erfc_impl<double>
} // end namespace internal
-
namespace numext {
-template<typename Scalar>
-EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) lgamma(const Scalar& x)
-{
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar)
+ lgamma(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x);
}
-template<typename Scalar>
-EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x)
-{
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar)
+ digamma(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar)
+ erf(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x);
}
-template<typename Scalar>
-EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) erfc(const Scalar& x)
-{
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
+ erfc(const Scalar& x) {
return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
}
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index f199d1086..bc232526a 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -54,6 +54,8 @@ template<typename MatrixType> class Transpose
{
public:
+ typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+
typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
@@ -68,16 +70,16 @@ template<typename MatrixType> class Transpose
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename MatrixType::Nested>::type&
+ const typename internal::remove_all<MatrixTypeNested>::type&
nestedExpression() const { return m_matrix; }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
- typename internal::remove_all<typename MatrixType::Nested>::type&
- nestedExpression() { return m_matrix.const_cast_derived(); }
+ typename internal::remove_reference<MatrixTypeNested>::type&
+ nestedExpression() { return m_matrix; }
protected:
- typename MatrixType::Nested m_matrix;
+ typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
};
namespace internal {
diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h
index 678ba3288..19c17bb4a 100644
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@@ -325,7 +325,7 @@ class TranspositionsWrapper
protected:
- const typename IndicesType::Nested m_indices;
+ typename IndicesType::Nested m_indices;
};
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 27845e89c..f55b42eed 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -168,7 +168,7 @@ namespace internal {
template<typename MatrixType, unsigned int _Mode>
struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
{
- typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+ typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
typedef typename MatrixType::PlainObject FullMatrixType;
@@ -213,7 +213,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
IsVectorAtCompileTime = false
};
- // FIXME This, combined with const_cast_derived in transpose() leads to a const-correctness loophole
EIGEN_DEVICE_FUNC
explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
{}
@@ -235,7 +234,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
/** \returns a reference to the nested expression */
EIGEN_DEVICE_FUNC
- NestedExpression& nestedExpression() { return *const_cast<NestedExpression*>(&m_matrix); }
+ NestedExpression& nestedExpression() { return m_matrix; }
typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */
@@ -255,7 +254,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
inline TransposeReturnType transpose()
{
EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
- typename MatrixType::TransposeReturnType tmp(m_matrix.const_cast_derived());
+ typename MatrixType::TransposeReturnType tmp(m_matrix);
return TransposeReturnType(tmp);
}
@@ -418,7 +417,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
{
EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType);
Base::check_coordinates_internal(row, col);
- return derived().nestedExpression().const_cast_derived().coeffRef(row, col);
+ return derived().nestedExpression().coeffRef(row, col);
}
/** Assigns a triangular matrix to a triangular part of a dense matrix */
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index 95bcaa86f..193891189 100755
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -124,7 +124,7 @@ struct member_lpnorm {
template <typename BinaryOp, typename Scalar>
struct member_redux {
typedef typename result_of<
- BinaryOp(Scalar,Scalar)
+ BinaryOp(const Scalar&,const Scalar&)
>::type result_type;
template<typename _Scalar, int Size> struct Cost
{ enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 7aac0b6e1..d71dfc968 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -197,7 +197,7 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
* \warning the result is undefined if \c *this contains NaN.
*
- * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff()
+ * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
*/
template<typename Derived>
template<typename IndexType>
@@ -215,7 +215,7 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
/** \returns the minimum of all coefficients of *this and puts in *index its location.
* \warning the result is undefined if \c *this contains NaN.
*
- * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff()
+ * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
*/
template<typename Derived>
template<typename IndexType>
@@ -233,7 +233,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
* \warning the result is undefined if \c *this contains NaN.
*
- * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
+ * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
*/
template<typename Derived>
template<typename IndexType>
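The corrected \sa links point to DenseBase::visit(); a sketch of the index-returning overloads documented here, assuming <Eigen/Dense> is included:

    Eigen::MatrixXf m = Eigen::MatrixXf::Random(3, 3);
    Eigen::Index r, c;
    float mn = m.minCoeff(&r, &c);  // mn == m(r, c)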
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h
index ecd5c444e..a2c06a817 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -79,6 +79,20 @@ double2 plgamma<double2>(const double2& a)
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+ using numext::digamma;
+ return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+ using numext::digamma;
+ return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 perf<float4>(const float4& a)
{
return make_float4(erf(a.x), erf(a.y), erf(a.z), erf(a.w));
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 9d5773106..d3d9f910e 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -40,6 +40,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasSqrt = 1,
HasRsqrt = 1,
HasLGamma = 1,
+ HasDiGamma = 1,
HasErf = 1,
HasErfc = 1,
@@ -63,6 +64,7 @@ template<> struct packet_traits<double> : default_packet_traits
HasSqrt = 1,
HasRsqrt = 1,
HasLGamma = 1,
+ HasDiGamma = 1,
HasErf = 1,
HasErfc = 1,
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index cd9fbf267..c5836d048 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -37,7 +37,7 @@ template<typename Scalar>
struct functor_traits<scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
-template <typename Scalar, typename Packet, bool RandomAccess> struct linspaced_op_impl;
+template <typename Scalar, typename Packet, bool RandomAccess, bool IsInteger> struct linspaced_op_impl;
// linear access for packet ops:
// 1) initialization
@@ -48,12 +48,12 @@ template <typename Scalar, typename Packet, bool RandomAccess> struct linspaced_
// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
// in order to avoid the padd() in operator() ?
template <typename Scalar, typename Packet>
-struct linspaced_op_impl<Scalar,Packet,false>
+struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/false,/*IsInteger*/false>
{
- linspaced_op_impl(const Scalar& low, const Scalar& step) :
- m_low(low), m_step(step),
- m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*step)),
- m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Packet>(-unpacket_traits<Packet>::size)))) {}
+ linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+ m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
+ m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*m_step)),
+ m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(m_step),plset<Packet>(-unpacket_traits<Packet>::size)))) {}
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const
@@ -75,11 +75,11 @@ struct linspaced_op_impl<Scalar,Packet,false>
// 1) each step
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
template <typename Scalar, typename Packet>
-struct linspaced_op_impl<Scalar,Packet,true>
+struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/false>
{
- linspaced_op_impl(const Scalar& low, const Scalar& step) :
- m_low(low), m_step(step),
- m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {}
+ linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+ m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
+ m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {}
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
@@ -95,6 +95,31 @@ struct linspaced_op_impl<Scalar,Packet,true>
const Packet m_interPacket;
};
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/true>
+{
+ linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
+ m_low(low), m_length(high-low), m_divisor(num_steps==1?1:num_steps-1), m_interPacket(plset<Packet>(0))
+ {}
+
+ template<typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Scalar operator() (Index i) const {
+ return m_low + (m_length*Scalar(i))/m_divisor;
+ }
+
+ template<typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Packet packetOp(Index i) const {
+ return internal::padd(pset1<Packet>(m_low), pdiv(pmul(pset1<Packet>(m_length), padd(pset1<Packet>(Scalar(i)),m_interPacket)),
+ pset1<Packet>(m_divisor))); }
+
+ const Scalar m_low;
+ const Scalar m_length;
+ const Index m_divisor;
+ const Packet m_interPacket;
+};
+
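A sketch of what the integer specialization above yields, using its exact expression m_low + (m_length*i)/m_divisor with integer division (assuming LinSpaced routes through linspaced_op as usual):

    // LinSpaced(4, 0, 9): m_length == 9, m_divisor == 3, so i = 0..3
    // gives 0 + 9*i/3, i.e. the sequence 0 3 6 9
    Eigen::VectorXi v = Eigen::VectorXi::LinSpaced(4, 0, 9);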
// ----- Linspace functor ----------------------------------------------------------------
// Forward declaration (we default to random access which does not really give
@@ -102,10 +127,20 @@ struct linspaced_op_impl<Scalar,Packet,true>
// nested expressions).
template <typename Scalar, typename PacketType, bool RandomAccess = true> struct linspaced_op;
template <typename Scalar, typename PacketType, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,PacketType,RandomAccess> >
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
+{
+ enum
+ {
+ Cost = 1,
+ PacketAccess = packet_traits<Scalar>::HasSetLinear
+ && ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),
+ IsRepeatable = true
+ };
+};
template <typename Scalar, typename PacketType, bool RandomAccess> struct linspaced_op
{
- linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {}
+ linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
+ : impl((num_steps==1 ? high : low),high,num_steps)
+ {}
template<typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
@@ -134,7 +169,9 @@ template <typename Scalar, typename PacketType, bool RandomAccess> struct linspa
// This proxy object handles the actual required temporaries, the different
// implementations (random vs. sequential access) as well as the
// correct piping to size 2/4 packet operations.
- const linspaced_op_impl<Scalar,PacketType,RandomAccess> impl;
+ // As long as we don't have a Bresenham-like implementation for linear-access and integer types,
+ // we have to bypass RandomAccess for integer types. See bug 698.
+ const linspaced_op_impl<Scalar,PacketType,(NumTraits<Scalar>::IsInteger?true:RandomAccess),NumTraits<Scalar>::IsInteger> impl;
};
// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 01727f250..531beead6 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -428,6 +428,28 @@ struct functor_traits<scalar_lgamma_op<Scalar> >
};
/** \internal
+ * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
+ * \sa class CwiseUnaryOp, Cwise::digamma()
+ */
+template<typename Scalar> struct scalar_digamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+ using numext::digamma; return digamma(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_digamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasDiGamma
+ };
+};
+
+/** \internal
* \brief Template functor to compute the Gauss error function of a
* scalar
* \sa class CwiseUnaryOp, Cwise::erf()
@@ -644,7 +666,7 @@ struct functor_traits<scalar_floor_op<Scalar> >
template<typename Scalar> struct scalar_ceil_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); }
- typedef typename packet_traits<Scalar>::type Packet;
+ template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); }
};
template<typename Scalar>
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 229e96ceb..d2e6f26c8 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -252,7 +252,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// we have both L2 and L3, and problem is small enough to be kept in L2
// Let's choose m such that lhs's block fit in 1/3 of L2
actual_lm = l2;
- max_mc = 576;
+ max_mc = (std::min<Index>)(576,max_mc);
}
Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
if (mc > Traits::mr) mc -= mc % Traits::mr;
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index d77fc2630..a39c7808c 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -352,9 +352,8 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
}
else // no l3 blocking
{
- Index m = this->m_mc;
Index n = this->m_nc;
- computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
+ computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads);
}
m_sizeA = this->m_mc * this->m_kc;
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index a36eb2fe0..831089dee 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -42,13 +42,14 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
- const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+ const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
+ const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
{
general_matrix_matrix_triangular_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor, UpLo==Lower?Upper:Lower>
- ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+ ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking);
}
};
@@ -58,7 +59,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
- const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
+ const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
+ const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
{
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
@@ -69,16 +71,18 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
- Index kc = depth; // cache block size along the K direction
- Index mc = size; // cache block size along the M direction
- Index nc = size; // cache block size along the N direction
- computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
+ Index kc = blocking.kc();
+ Index mc = (std::min)(size,blocking.mc());
+
// !!! mc must be a multiple of nr:
if(mc > Traits::nr)
mc = (mc/Traits::nr)*Traits::nr;
- ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
- ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
+ std::size_t sizeA = kc*mc;
+ std::size_t sizeB = kc*size;
+
+ ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
@@ -136,7 +140,7 @@ struct tribb_kernel
typedef typename Traits::ResScalar ResScalar;
enum {
- BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
+ BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
};
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{
@@ -256,13 +260,27 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+ enum {
+ IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+ LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0,
+ RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0
+ };
+
+ Index size = mat.cols();
+ Index depth = actualLhs.cols();
+
+ typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar,
+ MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType;
+
+ BlockingType blocking(size, size, depth, 1, false);
+
internal::general_matrix_matrix_triangular_product<Index,
- typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
- typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
- MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
- ::run(mat.cols(), actualLhs.cols(),
+ typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+ typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+ IsRowMajor ? RowMajor : ColMajor, UpLo>
+ ::run(size, depth,
&actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
- mat.data(), mat.outerStride(), actualAlpha);
+ mat.data(), mat.outerStride(), actualAlpha, blocking);
}
};
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index f84f54982..da6f82abc 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -291,7 +291,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
const Scalar* lhs, Index lhsStride,
const Scalar* rhs, Index rhsStride,
Scalar* res, Index resStride,
- const Scalar& alpha)
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
product_selfadjoint_matrix<Scalar, Index,
EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
@@ -299,7 +299,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
ColMajor>
- ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha);
+ ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
}
};
@@ -314,7 +314,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
- const Scalar& alpha);
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index,
@@ -325,7 +325,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
- const Scalar& alpha)
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
Index size = rows;
@@ -340,17 +340,14 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
- Index kc = size; // cache block size along the K direction
- Index mc = rows; // cache block size along the M direction
- Index nc = cols; // cache block size along the N direction
- computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
- // kc must smaller than mc
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ // kc must be smaller than mc
kc = (std::min)(kc,mc);
-
+ std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols;
- ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
- ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
- Scalar* blockB = allocatedBlockB;
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -410,7 +407,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
- const Scalar& alpha);
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index,
@@ -421,7 +418,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
- const Scalar& alpha)
+ const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
Index size = cols;
@@ -432,14 +429,12 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
LhsMapper lhs(_lhs,lhsStride);
ResMapper res(_res,resStride);
- Index kc = size; // cache block size along the K direction
- Index mc = rows; // cache block size along the M direction
- Index nc = cols; // cache block size along the N direction
- computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
+ Index kc = blocking.kc(); // cache block size along the K direction
+ Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols;
- ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
- ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
- Scalar* blockB = allocatedBlockB;
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+ ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -498,6 +493,11 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
* RhsBlasTraits::extractScalarFactor(a_rhs);
+ typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
+ Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType;
+
+ BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);
+
internal::product_selfadjoint_matrix<Scalar, Index,
EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
@@ -509,7 +509,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
- actualAlpha // alpha
+ actualAlpha, blocking // alpha
);
}
};
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index 2af00058d..f038d686f 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -92,15 +92,27 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
- enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+ enum {
+ IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0,
+ OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0
+ };
+
+ Index size = mat.cols();
+ Index depth = actualOther.cols();
+
+ typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar,
+ MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType;
+
+ BlockingType blocking(size, size, depth, 1, false);
+
internal::general_matrix_matrix_triangular_product<Index,
- Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
- Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
- MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
- ::run(mat.cols(), actualOther.cols(),
+ Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+ Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
+ IsRowMajor ? RowMajor : ColMajor, UpLo>
+ ::run(size, depth,
&actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
- mat.data(), mat.outerStride(), actualAlpha);
+ mat.data(), mat.outerStride(), actualAlpha, blocking);
}
};
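A user-level call that reaches this newly blocked path is the selfadjoint rank update; the BlockingType object is constructed internally as above. A sketch, assuming <Eigen/Dense> is included:

    Eigen::MatrixXd m = Eigen::MatrixXd::Zero(4, 4);
    Eigen::MatrixXd u = Eigen::MatrixXd::Random(4, 2);
    m.selfadjointView<Eigen::Lower>().rankUpdate(u, 1.0);  // m += u * u^T, lower part only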
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 39ab87df8..8a2f7cd78 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -126,6 +126,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
+ // The small panel size must not be larger than the blocking sizes.
+ // Usually this should never be the case because SmallPanelWidth^2 is very small
+ // compared to L2 cache size, but let's be safe:
+ Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc));
std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols;
@@ -169,9 +173,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
if(IsLower || actual_k2<rows)
{
// for each small vertical panels of lhs
- for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+ for (Index k1=0; k1<actual_kc; k1+=panelWidth)
{
- Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
+ Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth);
Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
Index startBlock = actual_k2+k1;
Index blockBOffset = k1;
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 747232938..91c61fcf2 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -15,10 +15,11 @@
// 4522 - 'class' : multiple assignment operators specified
// 4700 - uninitialized local variable 'xyz' used
// 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
+ // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
#pragma warning( push )
#endif
- #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 )
+ #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800)
#elif defined __INTEL_COMPILER
// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
// ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 9b4f8faa7..cf6b03ec7 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -336,7 +336,6 @@
// Do we support r-value references?
#if (__has_feature(cxx_rvalue_references) || \
(defined(__cplusplus) && __cplusplus >= 201103L) || \
- defined(__GXX_EXPERIMENTAL_CXX0X__) || \
(EIGEN_COMP_MSVC >= 1600))
#define EIGEN_HAVE_RVALUE_REFERENCES
#endif
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 823e077af..415bc48cb 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -526,9 +526,9 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align
template<int Alignment, typename Scalar, typename Index>
EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size)
{
- static const Index ScalarSize = sizeof(Scalar);
- static const Index AlignmentSize = Alignment / ScalarSize;
- static const Index AlignmentMask = AlignmentSize-1;
+ const Index ScalarSize = sizeof(Scalar);
+ const Index AlignmentSize = Alignment / ScalarSize;
+ const Index AlignmentMask = AlignmentSize-1;
if(AlignmentSize<=1)
{
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 617ba0a65..b01437d88 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -257,7 +257,7 @@ struct has_std_result_type {int a[2];};
struct has_tr1_result {int a[3];};
template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
-struct unary_result_of_select {typedef ArgType type;};
+struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;};
template<typename Func, typename ArgType>
struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
@@ -279,7 +279,7 @@ struct result_of<Func(ArgType)> {
};
template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
-struct binary_result_of_select {typedef ArgType0 type;};
+struct binary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
template<typename Func, typename ArgType0, typename ArgType1>
struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
@@ -326,6 +326,22 @@ class meta_sqrt
template<int Y, int InfX, int SupX>
class meta_sqrt<Y, InfX, SupX, true> { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
+
+/** \internal Computes the least common multiple of two positive integers A and B
+ * at compile-time. It implements a naive algorithm testing all multiples of A.
+ * It thus works better if A>=B.
+ */
+template<int A, int B, int K=1, bool Done = ((A*K)%B)==0>
+struct meta_least_common_multiple
+{
+ enum { ret = meta_least_common_multiple<A,B,K+1>::ret };
+};
+template<int A, int B, int K>
+struct meta_least_common_multiple<A,B,K,true>
+{
+ enum { ret = A*K };
+};
+
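A compile-time sketch of the helper (used earlier in this patch for tribb_kernel's BlockSize); the recursion tests A*1, A*2, ... until a multiple divisible by B is found.

    // 12%8 != 0, but 24%8 == 0, hence ret == 24
    // (C++11 static_assert shown only for illustration)
    static_assert(Eigen::internal::meta_least_common_multiple<12, 8>::ret == 24,
                  "lcm(12,8) == 24");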
/** \internal determines whether the product of two numeric types is allowed and what the return type is */
template<typename T, typename U> struct scalar_product_traits
{
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index e174509e0..afae2e51e 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -26,7 +26,7 @@
#ifndef EIGEN_NO_STATIC_ASSERT
- #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (EIGEN_COMP_MSVC >= 1600)
+ #if __has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)
// if native static_assert is enabled, let's use it
#define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 8b71e2c62..a001c473a 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -466,17 +466,17 @@ struct special_scalar_op_base : public BaseType
template<typename Derived,typename Scalar,typename OtherScalar, typename BaseType>
struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true> : public BaseType
{
- const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
+ const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
operator*(const OtherScalar& scalar) const
{
#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
#endif
- return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
+ return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
(*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar));
}
- inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
+ inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived>
operator*(const OtherScalar& scalar, const Derived& matrix)
{
#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
@@ -485,13 +485,13 @@ struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true> : publi
return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar);
}
- const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
+ const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, const Derived>
operator/(const OtherScalar& scalar) const
{
#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
#endif
- return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
+ return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, const Derived>
(*static_cast<const Derived*>(this), scalar_quotient2_op<Scalar,OtherScalar>(scalar));
}
};
@@ -526,22 +526,21 @@ template <typename A> struct promote_storage_type<const A, A>
* the functor.
* The default rules are as follows:
* \code
- * A op A -> A
- * A op dense -> dense
- * dense op B -> dense
- * A * dense -> A
- * dense * B -> B
+ * A op A -> A
+ * A op dense -> dense
+ * dense op B -> dense
+ * sparse op dense -> sparse
+ * dense op sparse -> sparse
* \endcode
*/
template <typename A, typename B, typename Functor> struct cwise_promote_storage_type;
-template <typename A, typename Functor> struct cwise_promote_storage_type<A,A,Functor> { typedef A ret; };
-template <typename Functor> struct cwise_promote_storage_type<Dense,Dense,Functor> { typedef Dense ret; };
-template <typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<Dense,Dense,scalar_product_op<ScalarA,ScalarB> > { typedef Dense ret; };
-template <typename A, typename Functor> struct cwise_promote_storage_type<A,Dense,Functor> { typedef Dense ret; };
-template <typename B, typename Functor> struct cwise_promote_storage_type<Dense,B,Functor> { typedef Dense ret; };
-template <typename A, typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<A,Dense,scalar_product_op<ScalarA,ScalarB> > { typedef A ret; };
-template <typename B, typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<Dense,B,scalar_product_op<ScalarA,ScalarB> > { typedef B ret; };
+template <typename A, typename Functor> struct cwise_promote_storage_type<A,A,Functor> { typedef A ret; };
+template <typename Functor> struct cwise_promote_storage_type<Dense,Dense,Functor> { typedef Dense ret; };
+template <typename A, typename Functor> struct cwise_promote_storage_type<A,Dense,Functor> { typedef Dense ret; };
+template <typename B, typename Functor> struct cwise_promote_storage_type<Dense,B,Functor> { typedef Dense ret; };
+template <typename Functor> struct cwise_promote_storage_type<Sparse,Dense,Functor> { typedef Sparse ret; };
+template <typename Functor> struct cwise_promote_storage_type<Dense,Sparse,Functor> { typedef Sparse ret; };
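A quick sanity check of the new promotion rules (a sketch, not part of the patch; internal::is_same and scalar_sum_op are used as they exist elsewhere in Eigen's internals):

\code
using namespace Eigen;
// A mixed sparse/dense coefficient-wise sum now resolves to a sparse storage kind:
static_assert(internal::is_same<
                internal::cwise_promote_storage_type<Sparse, Dense,
                    internal::scalar_sum_op<double> >::ret,
                Sparse>::value,
              "sparse op dense promotes to Sparse");
\endcode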
/** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B.
* The template parameter ProductTag permits to specialize the resulting storage kind wrt to
diff --git a/Eigen/src/Geometry/ParametrizedLine.h b/Eigen/src/Geometry/ParametrizedLine.h
index fdcd69760..c43dce773 100644
--- a/Eigen/src/Geometry/ParametrizedLine.h
+++ b/Eigen/src/Geometry/ParametrizedLine.h
@@ -129,7 +129,7 @@ public:
* determined by \a prec.
*
* \sa MatrixBase::isApprox() */
- bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
+ bool isApprox(const ParametrizedLine& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
{ return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
protected:
diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h
index 87ea445e9..82d7777f0 100644
--- a/Eigen/src/Geometry/Translation.h
+++ b/Eigen/src/Geometry/Translation.h
@@ -162,7 +162,7 @@ public:
* determined by \a prec.
*
* \sa MatrixBase::isApprox() */
- bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
+ bool isApprox(const Translation& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
{ return m_coeffs.isApprox(other.m_coeffs, prec); }
};
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index 18e9d1466..e45c272b4 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -37,6 +37,8 @@ namespace Eigen {
 * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then the factorization is directly performed
 * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta|) I \f$ where
* \f$ \sigma \f$ is the initial shift value as returned and set by setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$.
+ * If the factorization fails, then the shift is doubled until it succeeds, or a maximum of ten attempts is reached. If it still fails, as reported by
+ * the info() method, then you can either increase the initial shift or, better, use another preconditioning technique.
*
*/
template <typename Scalar, int _UpLo = Lower, typename _OrderingType =
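To make the retry strategy concrete, here is a minimal usage sketch (not part of the patch; the solver/preconditioner pairing below is assumed to compile as in current Eigen):

\code
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <iostream>

int main()
{
  using namespace Eigen;
  // Small SPD system: A = [4 1; 1 3], b = [1; 2]
  SparseMatrix<double> A(2,2);
  A.insert(0,0) = 4; A.insert(1,0) = 1;
  A.insert(0,1) = 1; A.insert(1,1) = 3;
  A.makeCompressed();
  VectorXd b(2); b << 1, 2;

  ConjugateGradient<SparseMatrix<double>, Lower|Upper,
                    IncompleteCholesky<double> > cg;
  cg.preconditioner().setInitialShift(1e-3); // sigma; doubled on failure as documented above
  cg.compute(A);
  if (cg.preconditioner().info() != Success)
    std::cerr << "incomplete factorization failed despite the shift retries\n";
  std::cout << cg.solve(b).transpose() << std::endl;
  return 0;
}
\endcode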
@@ -185,6 +187,10 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol);
};
+// Based on the following paper:
+// C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+// Limited Memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999
+// http://ftp.mcs.anl.gov/pub/tech_reports/reports/P682.pdf
template<typename Scalar, int _UpLo, typename OrderingType>
template<typename _MatrixType>
void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
@@ -240,7 +246,7 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
else
m_scale(j) = 1;
- // FIXME disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
+ // TODO disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
// Scale and compute the shift for the matrix
RealScalar mindiag = NumTraits<RealScalar>::highest();
@@ -251,96 +257,122 @@ void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType
eigen_internal_assert(rowIdx[colPtr[j]]==j && "IncompleteCholesky: only the lower triangular part must be stored");
mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag);
}
+
+ FactorType L_save = m_L;
RealScalar shift = 0;
if(mindiag <= RealScalar(0.))
shift = m_initialShift - mindiag;
- // Apply the shift to the diagonal elements of the matrix
- for (Index j = 0; j < n; j++)
- vals[colPtr[j]] += shift;
-
- // jki version of the Cholesky factorization
- for (Index j=0; j < n; ++j)
- {
- // Left-looking factorization of the j-th column
- // First, load the j-th column into col_vals
- Scalar diag = vals[colPtr[j]]; // It is assumed that only the lower part is stored
- col_nnz = 0;
- for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++)
- {
- StorageIndex l = rowIdx[i];
- col_vals(col_nnz) = vals[i];
- col_irow(col_nnz) = l;
- col_pattern(l) = col_nnz;
- col_nnz++;
- }
+ m_info = NumericalIssue;
+
+ // Try to perform the incomplete factorization using the current shift
+ int iter = 0;
+ do
+ {
+ // Apply the shift to the diagonal elements of the matrix
+ for (Index j = 0; j < n; j++)
+ vals[colPtr[j]] += shift;
+
+ // jki version of the Cholesky factorization
+ Index j=0;
+ for (; j < n; ++j)
{
- typename std::list<StorageIndex>::iterator k;
- // Browse all previous columns that will update column j
- for(k = listCol[j].begin(); k != listCol[j].end(); k++)
+ // Left-looking factorization of the j-th column
+ // First, load the j-th column into col_vals
+ Scalar diag = vals[colPtr[j]]; // It is assumed that only the lower part is stored
+ col_nnz = 0;
+ for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++)
{
- Index jk = firstElt(*k); // First element to use in the column
- eigen_internal_assert(rowIdx[jk]==j);
- Scalar v_j_jk = numext::conj(vals[jk]);
-
- jk += 1;
- for (Index i = jk; i < colPtr[*k+1]; i++)
+ StorageIndex l = rowIdx[i];
+ col_vals(col_nnz) = vals[i];
+ col_irow(col_nnz) = l;
+ col_pattern(l) = col_nnz;
+ col_nnz++;
+ }
+ {
+ typename std::list<StorageIndex>::iterator k;
+ // Browse all previous columns that will update column j
+ for(k = listCol[j].begin(); k != listCol[j].end(); k++)
{
- StorageIndex l = rowIdx[i];
- if(col_pattern[l]<0)
+ Index jk = firstElt(*k); // First element to use in the column
+ eigen_internal_assert(rowIdx[jk]==j);
+ Scalar v_j_jk = numext::conj(vals[jk]);
+
+ jk += 1;
+ for (Index i = jk; i < colPtr[*k+1]; i++)
{
- col_vals(col_nnz) = vals[i] * v_j_jk;
- col_irow[col_nnz] = l;
- col_pattern(l) = col_nnz;
- col_nnz++;
+ StorageIndex l = rowIdx[i];
+ if(col_pattern[l]<0)
+ {
+ col_vals(col_nnz) = vals[i] * v_j_jk;
+ col_irow[col_nnz] = l;
+ col_pattern(l) = col_nnz;
+ col_nnz++;
+ }
+ else
+ col_vals(col_pattern[l]) -= vals[i] * v_j_jk;
}
- else
- col_vals(col_pattern[l]) -= vals[i] * v_j_jk;
+ updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
}
- updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
}
+
+ // Scale the current column
+ if(numext::real(diag) <= 0)
+ {
+ if(++iter>=10)
+ return;
+
+ // increase shift
+ shift = numext::maxi(m_initialShift,RealScalar(2)*shift);
+ // restore m_L, col_pattern, and listCol
+ vals = Map<const VectorSx>(L_save.valuePtr(), nnz);
+ rowIdx = Map<const VectorIx>(L_save.innerIndexPtr(), nnz);
+ colPtr = Map<const VectorIx>(L_save.outerIndexPtr(), n+1);
+ col_pattern.fill(-1);
+ for(Index i=0; i<n; ++i)
+ listCol[i].clear();
+
+ break;
+ }
+
+ RealScalar rdiag = sqrt(numext::real(diag));
+ vals[colPtr[j]] = rdiag;
+ for (Index k = 0; k<col_nnz; ++k)
+ {
+ Index i = col_irow[k];
+ //Scale
+ col_vals(k) /= rdiag;
+ //Update the remaining diagonals with col_vals
+ vals[colPtr[i]] -= numext::abs2(col_vals(k));
+ }
+ // Select the largest p elements
+ // p is the original number of elements in the column (without the diagonal)
+ Index p = colPtr[j+1] - colPtr[j] - 1 ;
+ Ref<VectorSx> cvals = col_vals.head(col_nnz);
+ Ref<VectorIx> cirow = col_irow.head(col_nnz);
+ internal::QuickSplit(cvals,cirow, p);
+ // Insert the largest p elements in the matrix
+ Index cpt = 0;
+ for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++)
+ {
+ vals[i] = col_vals(cpt);
+ rowIdx[i] = col_irow(cpt);
+ // restore col_pattern:
+ col_pattern(col_irow(cpt)) = -1;
+ cpt++;
+ }
+ // Get the first smallest row index and put it after the diagonal element
+ Index jk = colPtr(j)+1;
+ updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol);
}
-
- // Scale the current column
- if(numext::real(diag) <= 0)
- {
- m_info = NumericalIssue;
- return;
- }
-
- RealScalar rdiag = sqrt(numext::real(diag));
- vals[colPtr[j]] = rdiag;
- for (Index k = 0; k<col_nnz; ++k)
- {
- Index i = col_irow[k];
- //Scale
- col_vals(k) /= rdiag;
- //Update the remaining diagonals with col_vals
- vals[colPtr[i]] -= numext::abs2(col_vals(k));
- }
- // Select the largest p elements
- // p is the original number of elements in the column (without the diagonal)
- Index p = colPtr[j+1] - colPtr[j] - 1 ;
- Ref<VectorSx> cvals = col_vals.head(col_nnz);
- Ref<VectorIx> cirow = col_irow.head(col_nnz);
- internal::QuickSplit(cvals,cirow, p);
- // Insert the largest p elements in the matrix
- Index cpt = 0;
- for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++)
+
+ if(j==n)
{
- vals[i] = col_vals(cpt);
- rowIdx[i] = col_irow(cpt);
- // restore col_pattern:
- col_pattern(col_irow(cpt)) = -1;
- cpt++;
+ m_factorizationIsOk = true;
+ m_info = Success;
}
- // Get the first smallest row index and put it after the diagonal element
- Index jk = colPtr(j)+1;
- updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol);
- }
- m_factorizationIsOk = true;
- m_info = Success;
+ } while(m_info!=Success);
}
template<typename Scalar, int _UpLo, typename OrderingType>
diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h
index d1d08ca57..f91ecb24e 100644
--- a/Eigen/src/OrderingMethods/Amd.h
+++ b/Eigen/src/OrderingMethods/Amd.h
@@ -8,7 +8,7 @@
NOTE: this routine has been adapted from the CSparse library:
Copyright (c) 2006, Timothy A. Davis.
-http://www.cise.ufl.edu/research/sparse/CSparse
+http://www.suitesparse.com
CSparse is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 70c987afa..933cd564b 100644
--- a/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -41,12 +41,8 @@
//
// The colamd/symamd library is available at
//
-// http://www.cise.ufl.edu/research/sparse/colamd/
+// http://www.suitesparse.com
-// This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
-// file. It is required by the colamd.c, colamdmex.c, and symamdmex.c
-// files, and by any C code that calls the routines whose prototypes are
-// listed below, or that uses the colamd/symamd definitions listed below.
#ifndef EIGEN_COLAMD_H
#define EIGEN_COLAMD_H
@@ -102,9 +98,6 @@ namespace internal {
/* === Definitions ========================================================== */
/* ========================================================================== */
-#define COLAMD_MAX(a,b) (((a) > (b)) ? (a) : (b))
-#define COLAMD_MIN(a,b) (((a) < (b)) ? (a) : (b))
-
#define ONES_COMPLEMENT(r) (-(r)-1)
/* -------------------------------------------------------------------------- */
@@ -739,8 +732,8 @@ static void init_scoring
/* === Extract knobs ==================================================== */
- dense_row_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
- dense_col_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
+ dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ;
+ dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ;
COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
max_deg = 0 ;
n_col2 = n_col ;
@@ -804,7 +797,7 @@ static void init_scoring
else
{
/* keep track of max degree of remaining rows */
- max_deg = COLAMD_MAX (max_deg, deg) ;
+ max_deg = numext::maxi(max_deg, deg) ;
}
}
COLAMD_DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
@@ -842,7 +835,7 @@ static void init_scoring
/* add row's external degree */
score += Row [row].shared1.degree - 1 ;
/* guard against integer overflow */
- score = COLAMD_MIN (score, n_col) ;
+ score = numext::mini(score, n_col) ;
}
/* determine pruned column length */
col_length = (IndexType) (new_cp - &A [Col [c].start]) ;
@@ -914,7 +907,7 @@ static void init_scoring
head [score] = c ;
/* see if this score is less than current min */
- min_score = COLAMD_MIN (min_score, score) ;
+ min_score = numext::mini(min_score, score) ;
}
@@ -1040,7 +1033,7 @@ static IndexType find_ordering /* return the number of garbage collections */
/* === Garbage_collection, if necessary ============================= */
- needed_memory = COLAMD_MIN (pivot_col_score, n_col - k) ;
+ needed_memory = numext::mini(pivot_col_score, n_col - k) ;
if (pfree + needed_memory >= Alen)
{
pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
@@ -1099,7 +1092,7 @@ static IndexType find_ordering /* return the number of garbage collections */
/* clear tag on pivot column */
Col [pivot_col].shared1.thickness = pivot_col_thickness ;
- max_deg = COLAMD_MAX (max_deg, pivot_row_degree) ;
+ max_deg = numext::maxi(max_deg, pivot_row_degree) ;
/* === Kill all rows used to construct pivot row ==================== */
@@ -1273,7 +1266,7 @@ static IndexType find_ordering /* return the number of garbage collections */
/* add set difference */
cur_score += row_mark - tag_mark ;
/* integer overflow... */
- cur_score = COLAMD_MIN (cur_score, n_col) ;
+ cur_score = numext::mini(cur_score, n_col) ;
}
/* recompute the column's length */
@@ -1386,7 +1379,7 @@ static IndexType find_ordering /* return the number of garbage collections */
cur_score -= Col [col].shared1.thickness ;
/* make sure score is less or equal than the max score */
- cur_score = COLAMD_MIN (cur_score, max_score) ;
+ cur_score = numext::mini(cur_score, max_score) ;
COLAMD_ASSERT (cur_score >= 0) ;
/* store updated score */
@@ -1409,7 +1402,7 @@ static IndexType find_ordering /* return the number of garbage collections */
head [cur_score] = col ;
/* see if this score is less than current min */
- min_score = COLAMD_MIN (min_score, cur_score) ;
+ min_score = numext::mini(min_score, cur_score) ;
}
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index 10be84856..43fe788d9 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -100,11 +100,11 @@ protected:
enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
public:
- inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index i)
+ inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index i)
: m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize)
{}
- inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+ inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
: m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols))
{}
@@ -112,7 +112,7 @@ public:
inline BlockType& operator=(const SparseMatrixBase<OtherDerived>& other)
{
typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _NestedMatrixType;
- _NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);;
+ _NestedMatrixType& matrix = m_matrix;
// This assignment is slow if this vector set is not empty
// and/or it is not at the end of the nonzeros of the underlying matrix.
@@ -209,28 +209,28 @@ public:
inline const Scalar* valuePtr() const
{ return m_matrix.valuePtr(); }
inline Scalar* valuePtr()
- { return m_matrix.const_cast_derived().valuePtr(); }
+ { return m_matrix.valuePtr(); }
inline const StorageIndex* innerIndexPtr() const
{ return m_matrix.innerIndexPtr(); }
inline StorageIndex* innerIndexPtr()
- { return m_matrix.const_cast_derived().innerIndexPtr(); }
+ { return m_matrix.innerIndexPtr(); }
inline const StorageIndex* outerIndexPtr() const
{ return m_matrix.outerIndexPtr() + m_outerStart; }
inline StorageIndex* outerIndexPtr()
- { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
+ { return m_matrix.outerIndexPtr() + m_outerStart; }
inline const StorageIndex* innerNonZeroPtr() const
{ return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
inline StorageIndex* innerNonZeroPtr()
- { return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); }
+ { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
inline Scalar& coeffRef(Index row, Index col)
{
- return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart));
+ return m_matrix.coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart));
}
inline const Scalar coeff(Index row, Index col) const
@@ -264,7 +264,7 @@ public:
protected:
- typename SparseMatrixType::Nested m_matrix;
+ typename internal::ref_selector<SparseMatrixType>::non_const_type m_matrix;
Index m_outerStart;
const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
@@ -373,7 +373,7 @@ public:
/** Column or Row constructor
*/
- inline BlockImpl(const XprType& xpr, Index i)
+ inline BlockImpl(XprType& xpr, Index i)
: m_matrix(xpr),
m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? convert_index(i) : 0),
m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? convert_index(i) : 0),
@@ -383,7 +383,7 @@ public:
/** Dynamic-size constructor
*/
- inline BlockImpl(const XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+ inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
: m_matrix(xpr), m_startRow(convert_index(startRow)), m_startCol(convert_index(startCol)), m_blockRows(convert_index(blockRows)), m_blockCols(convert_index(blockCols))
{}
@@ -392,8 +392,7 @@ public:
inline Scalar& coeffRef(Index row, Index col)
{
- return m_matrix.const_cast_derived()
- .coeffRef(row + m_startRow.value(), col + m_startCol.value());
+ return m_matrix.coeffRef(row + m_startRow.value(), col + m_startCol.value());
}
inline const Scalar coeff(Index row, Index col) const
@@ -403,16 +402,14 @@ public:
inline Scalar& coeffRef(Index index)
{
- return m_matrix.const_cast_derived()
- .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
- m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ return m_matrix.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
inline const Scalar coeff(Index index) const
{
- return m_matrix
- .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
- m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+ return m_matrix.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+ m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
}
inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
@@ -430,7 +427,7 @@ public:
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
- typename XprType::Nested m_matrix;
+ typename internal::ref_selector<XprType>::non_const_type m_matrix;
const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index f78d7c24d..ea71b41d1 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -117,6 +117,24 @@ template<typename Derived>
class SparseCompressedBase<Derived>::InnerIterator
{
public:
+ InnerIterator()
+ : m_values(0), m_indices(0), m_outer(0), m_id(0), m_end(0)
+ {}
+
+ InnerIterator(const InnerIterator& other)
+ : m_values(other.m_values), m_indices(other.m_indices), m_outer(other.m_outer), m_id(other.m_id), m_end(other.m_end)
+ {}
+
+ InnerIterator& operator=(const InnerIterator& other)
+ {
+ m_values = other.m_values;
+ m_indices = other.m_indices;
+ const_cast<OuterType&>(m_outer).setValue(other.m_outer.value());
+ m_id = other.m_id;
+ m_end = other.m_end;
+ return *this;
+ }
+
InnerIterator(const SparseCompressedBase& mat, Index outer)
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
{
@@ -162,7 +180,8 @@ class SparseCompressedBase<Derived>::InnerIterator
protected:
const Scalar* m_values;
const StorageIndex* m_indices;
- const internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> m_outer;
+ typedef internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> OuterType;
+ const OuterType m_outer;
Index m_id;
Index m_end;
private:
diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index d9420ac63..c57d9ac59 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -49,17 +49,10 @@ class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
namespace internal {
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
- typename _LhsStorageMode = typename traits<Lhs>::StorageKind,
- typename _RhsStorageMode = typename traits<Rhs>::StorageKind>
-class sparse_cwise_binary_op_inner_iterator_selector;
-
-} // end namespace internal
-
-namespace internal {
-
// Generic "sparse OP sparse"
+template<typename XprType> struct binary_sparse_evaluator;
+
template<typename BinaryOp, typename Lhs, typename Rhs>
struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IteratorBased>
: evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
@@ -153,6 +146,182 @@ protected:
evaluator<Rhs> m_rhsImpl;
};
+// dense op sparse
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IteratorBased>
+ : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+protected:
+ typedef typename evaluator<Rhs>::InnerIterator RhsIterator;
+ typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+ typedef typename traits<XprType>::Scalar Scalar;
+ typedef typename XprType::StorageIndex StorageIndex;
+public:
+
+ class ReverseInnerIterator;
+ class InnerIterator
+ {
+ enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
+ public:
+
+ EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+ : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.rhs().innerSize())
+ {
+ this->operator++();
+ }
+
+ EIGEN_STRONG_INLINE InnerIterator& operator++()
+ {
+ ++m_id;
+ if(m_id<m_innerSize)
+ {
+ Scalar lhsVal = m_lhsEval.coeff(IsRowMajor?m_rhsIter.outer():m_id,
+ IsRowMajor?m_id:m_rhsIter.outer());
+ if(m_rhsIter && m_rhsIter.index()==m_id)
+ {
+ m_value = m_functor(lhsVal, m_rhsIter.value());
+ ++m_rhsIter;
+ }
+ else
+ m_value = m_functor(lhsVal, Scalar(0));
+ }
+
+ return *this;
+ }
+
+ EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
+
+ EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+ EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_rhsIter.outer() : m_id; }
+ EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_rhsIter.outer(); }
+
+ EIGEN_STRONG_INLINE operator bool() const { return m_id<m_innerSize; }
+
+ protected:
+ const evaluator<Lhs> &m_lhsEval;
+ RhsIterator m_rhsIter;
+ const BinaryOp& m_functor;
+ Scalar m_value;
+ StorageIndex m_id;
+ StorageIndex m_innerSize;
+ };
+
+
+ enum {
+ CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+ // Expose storage order of the sparse expression
+ Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit)
+ };
+
+ explicit binary_evaluator(const XprType& xpr)
+ : m_functor(xpr.functor()),
+ m_lhsImpl(xpr.lhs()),
+ m_rhsImpl(xpr.rhs()),
+ m_expr(xpr)
+ {
+ EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+ EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+ }
+
+ inline Index nonZerosEstimate() const {
+ return m_expr.size();
+ }
+
+protected:
+ const BinaryOp m_functor;
+ evaluator<Lhs> m_lhsImpl;
+ evaluator<Rhs> m_rhsImpl;
+ const XprType &m_expr;
+};
+
+// sparse op dense
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IndexBased>
+ : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+protected:
+ typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
+ typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+ typedef typename traits<XprType>::Scalar Scalar;
+ typedef typename XprType::StorageIndex StorageIndex;
+public:
+
+ class ReverseInnerIterator;
+ class InnerIterator
+ {
+ enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
+ public:
+
+ EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+ : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_id(-1), m_innerSize(aEval.m_expr.lhs().innerSize())
+ {
+ this->operator++();
+ }
+
+ EIGEN_STRONG_INLINE InnerIterator& operator++()
+ {
+ ++m_id;
+ if(m_id<m_innerSize)
+ {
+ Scalar rhsVal = m_rhsEval.coeff(IsRowMajor?m_lhsIter.outer():m_id,
+ IsRowMajor?m_id:m_lhsIter.outer());
+ if(m_lhsIter && m_lhsIter.index()==m_id)
+ {
+ m_value = m_functor(m_lhsIter.value(), rhsVal);
+ ++m_lhsIter;
+ }
+ else
+ m_value = m_functor(Scalar(0),rhsVal);
+ }
+
+ return *this;
+ }
+
+ EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
+
+ EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+ EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_lhsIter.outer() : m_id; }
+ EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_lhsIter.outer(); }
+
+ EIGEN_STRONG_INLINE operator bool() const { return m_id<m_innerSize; }
+
+ protected:
+ LhsIterator m_lhsIter;
+ const evaluator<Rhs> &m_rhsEval;
+ const BinaryOp& m_functor;
+ Scalar m_value;
+ StorageIndex m_id;
+ StorageIndex m_innerSize;
+ };
+
+
+ enum {
+ CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+ // Expose storage order of the sparse expression
+ Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit)
+ };
+
+ explicit binary_evaluator(const XprType& xpr)
+ : m_functor(xpr.functor()),
+ m_lhsImpl(xpr.lhs()),
+ m_rhsImpl(xpr.rhs()),
+ m_expr(xpr)
+ {
+ EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+ EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+ }
+
+ inline Index nonZerosEstimate() const {
+ return m_expr.size();
+ }
+
+protected:
+ const BinaryOp m_functor;
+ evaluator<Lhs> m_lhsImpl;
+ evaluator<Rhs> m_rhsImpl;
+ const XprType &m_expr;
+};
+
// "sparse .* sparse"
template<typename T, typename Lhs, typename Rhs>
struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs>, IteratorBased, IteratorBased>
@@ -287,7 +456,8 @@ public:
enum {
CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
- Flags = XprType::Flags
+ // Expose storage order of the sparse expression
+ Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit)
};
explicit binary_evaluator(const XprType& xpr)
@@ -360,7 +530,8 @@ public:
enum {
CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
- Flags = XprType::Flags
+ // Expose storage order of the sparse expression
+ Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit)
};
explicit binary_evaluator(const XprType& xpr)
@@ -428,6 +599,34 @@ SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived> &other) c
return typename CwiseProductDenseReturnType<OtherDerived>::Type(derived(), other.derived());
}
+template<typename DenseDerived, typename SparseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>
+operator+(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
+{
+ return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+}
+
+template<typename SparseDerived, typename DenseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
+operator+(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
+{
+ return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
+}
+
+template<typename DenseDerived, typename SparseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>
+operator-(const MatrixBase<DenseDerived> &a, const SparseMatrixBase<SparseDerived> &b)
+{
+ return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+}
+
+template<typename SparseDerived, typename DenseDerived>
+EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>
+operator-(const SparseMatrixBase<SparseDerived> &a, const MatrixBase<DenseDerived> &b)
+{
+ return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar>, const SparseDerived, const DenseDerived>(a.derived(), b.derived());
+}
+
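These overloads let mixed dense/sparse sums and differences compile directly; a small usage sketch (not part of the patch):

\code
using namespace Eigen;
MatrixXd D = MatrixXd::Random(100,100);
SparseMatrix<double> S(100,100);
S.insert(3,5) = 1.0;
MatrixXd R1 = D + S; // dense op sparse
MatrixXd R2 = S - D; // sparse op dense
\endcode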
} // end namespace Eigen
#endif // EIGEN_SPARSE_CWISE_BINARY_OP_H
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 87c946b9b..c9da8a2bb 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -48,7 +48,7 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
// It basically represents the minimal amount of work to be done to be worth it.
if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
{
- #pragma omp parallel for schedule(static) num_threads(threads)
+ #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
for(Index i=0; i<n; ++i)
processRow(lhsEval,rhs,res,alpha,i,c);
}
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 91bada40f..8b57445e6 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -538,7 +538,12 @@ class SparseMatrix
}
/** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
- * \sa reserve(), setZero()
+ *
+ * If the sizes of the matrix are decreased, then the matrix is turned into \b uncompressed-mode
+ * and the storage of the out-of-bounds coefficients is kept and reserved.
+ * Call makeCompressed() to pack the entries and squeeze out extra memory.
+ *
+ * \sa reserve(), setZero(), makeCompressed()
*/
void conservativeResize(Index rows, Index cols)
{
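A short sketch of the documented behavior (not part of the patch):

\code
Eigen::SparseMatrix<double> m(10,10);
m.insert(2,3) = 1.0;
m.conservativeResize(5,5); // shrink: the matrix becomes uncompressed and
                           // the out-of-bounds storage stays reserved
m.makeCompressed();        // pack the entries and free the extra memory
\endcode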
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 975cefd28..402733cce 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -55,10 +55,10 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::StorageIndex StorageIndex;
typedef Matrix<StorageIndex,Dynamic,1> VectorI;
- typedef typename MatrixType::Nested MatrixTypeNested;
+ typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
- explicit inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix)
+ explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{
eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices");
}
@@ -68,7 +68,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
/** \internal \returns a reference to the nested matrix */
const _MatrixTypeNested& matrix() const { return m_matrix; }
- _MatrixTypeNested& matrix() { return m_matrix.const_cast_derived(); }
+ typename internal::remove_reference<MatrixTypeNested>::type& matrix() { return m_matrix; }
/** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs.
*
@@ -158,7 +158,7 @@ template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
protected:
- typename MatrixType::Nested m_matrix;
+ MatrixTypeNested m_matrix;
//mutable VectorI m_countPerRow;
//mutable VectorI m_countPerCol;
private:
@@ -194,9 +194,9 @@ SparseSelfAdjointView<MatrixType,Mode>::rankUpdate(const SparseMatrixBase<Derive
{
SparseMatrix<Scalar,(MatrixType::Flags&RowMajorBit)?RowMajor:ColMajor> tmp = u * u.adjoint();
if(alpha==Scalar(0))
- m_matrix.const_cast_derived() = tmp.template triangularView<Mode>();
+ m_matrix = tmp.template triangularView<Mode>();
else
- m_matrix.const_cast_derived() += alpha * tmp.template triangularView<Mode>();
+ m_matrix += alpha * tmp.template triangularView<Mode>();
return *this;
}
diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h
index 7ec73a365..40a754300 100644
--- a/Eigen/src/SparseCore/SparseVector.h
+++ b/Eigen/src/SparseCore/SparseVector.h
@@ -205,23 +205,54 @@ class SparseVector
inline void finalize() {}
+ /** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */
void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
{
m_data.prune(reference,epsilon);
}
+ /** Resizes the sparse vector to \a rows x \a cols
+ *
+ * This method is provided for compatibility with matrices.
+ * For a column vector, \a cols must be equal to 1.
+ * For a row vector, \a rows must be equal to 1.
+ *
+ * \sa resize(Index)
+ */
void resize(Index rows, Index cols)
{
eigen_assert((IsColVector ? cols : rows)==1 && "Outer dimension must equal 1");
resize(IsColVector ? rows : cols);
}
+ /** Resizes the sparse vector to \a newSize.
+ * This method deletes all entries, thus leaving an empty sparse vector.
+ *
+ * \sa conservativeResize(), setZero() */
void resize(Index newSize)
{
m_size = newSize;
m_data.clear();
}
+ /** Resizes the sparse vector to \a newSize, while leaving old values untouched.
+ *
+ * If the size of the vector is decreased, then the storage of the out-of-bounds coefficients is kept and reserved.
+ * Call .data().squeeze() to free extra memory.
+ *
+ * \sa reserve(), setZero()
+ */
+ void conservativeResize(Index newSize)
+ {
+ if (newSize < m_size)
+ {
+ Index i = 0;
+ while (i<m_data.size() && m_data.index(i)<newSize) ++i;
+ m_data.resize(i);
+ }
+ m_size = newSize;
+ }
+
void resizeNonZeros(Index size) { m_data.resize(size); }
inline SparseVector() : m_size(0) { check_template_parameters(); resize(0); }
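The vector counterpart behaves analogously; a sketch (not part of the patch):

\code
Eigen::SparseVector<double> v(10);
v.insert(2) = 1.0;
v.insert(7) = 2.0;
v.conservativeResize(5); // keeps the entry at index 2, drops the one at index 7
v.data().squeeze();      // optionally free the reserved extra memory
\endcode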
diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h
index c945c4dab..b867877d8 100644
--- a/Eigen/src/SparseCore/SparseView.h
+++ b/Eigen/src/SparseCore/SparseView.h
@@ -38,7 +38,7 @@ public:
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0),
- RealScalar epsilon = NumTraits<Scalar>::dummy_precision())
+ const RealScalar &epsilon = NumTraits<Scalar>::dummy_precision())
: m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {}
inline Index rows() const { return m_matrix.rows(); }
diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h
index 0d448d02e..acd7f7e10 100644
--- a/Eigen/src/SparseQR/SparseQR.h
+++ b/Eigen/src/SparseQR/SparseQR.h
@@ -128,6 +128,17 @@ class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
inline Index cols() const { return m_pmat.cols();}
/** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
+ * \warning The entries of the returned matrix are not sorted. This means that using it in algorithms
+ * expecting sorted entries will fail. This includes random coefficient accesses (SparseMatrix::coeff())
+ * and coefficient-wise operations. Matrix products and triangular solves are fine, though.
+ *
+ * To sort the entries, you can assign it to a row-major matrix, and if a column-major matrix
+ * is required, you can copy it again:
+ * \code
+ * SparseMatrix<double> R = qr.matrixR(); // column-major, not sorted!
+ * SparseMatrix<double,RowMajor> Rr = qr.matrixR(); // row-major, sorted
+ * SparseMatrix<double> Rc = Rr; // column-major, sorted
+ * \endcode
*/
const QRMatrixType& matrixR() const { return m_R; }
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 01432e2f3..2ce7414a1 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -22,6 +22,7 @@ typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturn
typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
typedef CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> PowReturnType;
@@ -318,6 +319,16 @@ lgamma() const
return LgammaReturnType(derived());
}
+/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
+ *
+ * \sa lgamma()
+ */
+inline const DigammaReturnType
+digamma() const
+{
+ return DigammaReturnType(derived());
+}
+
/** \returns an expression of the coefficient-wise Gauss error
* function of *this.
*
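Usage of the new digamma API is straightforward (a sketch, not part of the patch):

\code
Eigen::ArrayXd x   = Eigen::ArrayXd::LinSpaced(4, 1.0, 4.0);
Eigen::ArrayXd psi = x.digamma(); // digamma(1) == -0.5772..., minus the Euler-Mascheroni constant
\endcode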
diff --git a/bench/tensors/README b/bench/tensors/README
new file mode 100644
index 000000000..6b51fe878
--- /dev/null
+++ b/bench/tensors/README
@@ -0,0 +1,8 @@
+Each benchmark comes in two flavors: one that runs on CPU and one that runs on GPU.
+
+To compile the CPU benchmarks, simply call:
+g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
+
+To compile the GPU benchmarks, simply call:
+nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu
+
diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h
new file mode 100644
index 000000000..f115b54ad
--- /dev/null
+++ b/bench/tensors/benchmark.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <vector>
+
+namespace testing {
+class Benchmark {
+ public:
+ Benchmark(const char* name, void (*fn)(int)) {
+ Register(name, fn, NULL);
+ }
+ Benchmark(const char* name, void (*fn_range)(int, int)) {
+ Register(name, NULL, fn_range);
+ }
+ Benchmark* Arg(int x);
+ Benchmark* Range(int lo, int hi);
+ const char* Name();
+ bool ShouldRun(int argc, char* argv[]);
+ void Run();
+ private:
+ const char* name_;
+ void (*fn_)(int);
+ void (*fn_range_)(int, int);
+ std::vector<int> args_;
+ void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));
+ void RunRepeatedlyWithArg(int iterations, int arg);
+ void RunWithArg(int arg);
+};
+} // namespace testing
+void SetBenchmarkFlopsProcessed(int64_t);
+void StopBenchmarkTiming();
+void StartBenchmarkTiming();
+#define BENCHMARK(f) \
+ static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
+ (new ::testing::Benchmark(#f, f))
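For context, this is how a client registers with the harness (hypothetical BM_sum; a sketch, not part of the patch):

\code
#include "benchmark.h"
#include <vector>

static void BM_sum(int iters, int n) {
  StopBenchmarkTiming();               // exclude setup from the timing
  std::vector<float> v(n, 1.0f);
  StartBenchmarkTiming();
  volatile float sink = 0;
  for (int i = 0; i < iters; ++i) {
    float s = 0;
    for (int j = 0; j < n; ++j) s += v[j];
    sink += s;                         // keep the loop from being optimized away
  }
  SetBenchmarkFlopsProcessed(static_cast<int64_t>(iters) * n);
}
BENCHMARK(BM_sum)->Range(1 << 10, 1 << 20);
\endcode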
diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc
new file mode 100644
index 000000000..1efa0dbad
--- /dev/null
+++ b/bench/tensors/benchmark_main.cc
@@ -0,0 +1,237 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "benchmark.h"
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <inttypes.h>
+#include <time.h>
+#include <map>
+
+static int64_t g_flops_processed;
+static int64_t g_benchmark_total_time_ns;
+static int64_t g_benchmark_start_time_ns;
+typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
+typedef BenchmarkMap::iterator BenchmarkMapIt;
+
+BenchmarkMap& gBenchmarks() {
+ static BenchmarkMap g_benchmarks;
+ return g_benchmarks;
+}
+
+static int g_name_column_width = 20;
+
+static int Round(int n) {
+ int base = 1;
+ while (base*10 < n) {
+ base *= 10;
+ }
+ if (n < 2*base) {
+ return 2*base;
+ }
+ if (n < 5*base) {
+ return 5*base;
+ }
+ return 10*base;
+}
+
+#ifdef __APPLE__
+ #include <mach/mach_time.h>
+ static mach_timebase_info_data_t g_time_info;
+ static void __attribute__((constructor)) init_info() {
+ mach_timebase_info(&g_time_info);
+ }
+#endif
+
+static int64_t NanoTime() {
+#if defined(__APPLE__)
+ uint64_t t = mach_absolute_time();
+ return t * g_time_info.numer / g_time_info.denom;
+#else
+ struct timespec t;
+ t.tv_sec = t.tv_nsec = 0;
+ clock_gettime(CLOCK_MONOTONIC, &t);
+ return static_cast<int64_t>(t.tv_sec) * 1000000000LL + t.tv_nsec;
+#endif
+}
+
+namespace testing {
+Benchmark* Benchmark::Arg(int arg) {
+ args_.push_back(arg);
+ return this;
+}
+
+Benchmark* Benchmark::Range(int lo, int hi) {
+ const int kRangeMultiplier = 8;
+ if (hi < lo) {
+ int temp = hi;
+ hi = lo;
+ lo = temp;
+ }
+ while (lo < hi) {
+ args_.push_back(lo);
+ lo *= kRangeMultiplier;
+ }
+ // We always run the hi number.
+ args_.push_back(hi);
+ return this;
+}
+
+const char* Benchmark::Name() {
+ return name_;
+}
+bool Benchmark::ShouldRun(int argc, char* argv[]) {
+ if (argc == 1) {
+ return true; // With no arguments, we run all benchmarks.
+ }
+ // Otherwise, we interpret each argument as a regular expression and
+ // see if any of our benchmarks match.
+ for (int i = 1; i < argc; i++) {
+ regex_t re;
+ if (regcomp(&re, argv[i], 0) != 0) {
+ fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
+ exit(EXIT_FAILURE);
+ }
+ int match = regexec(&re, name_, 0, NULL, 0);
+ regfree(&re);
+ if (match != REG_NOMATCH) {
+ return true;
+ }
+ }
+ return false;
+}
+void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
+ name_ = name;
+ fn_ = fn;
+ fn_range_ = fn_range;
+ if (fn_ == NULL && fn_range_ == NULL) {
+ fprintf(stderr, "%s: missing function\n", name_);
+ exit(EXIT_FAILURE);
+ }
+ gBenchmarks().insert(std::make_pair(name, this));
+}
+void Benchmark::Run() {
+ if (fn_ != NULL) {
+ RunWithArg(0);
+ } else {
+ if (args_.empty()) {
+ fprintf(stderr, "%s: no args!\n", name_);
+ exit(EXIT_FAILURE);
+ }
+ for (size_t i = 0; i < args_.size(); ++i) {
+ RunWithArg(args_[i]);
+ }
+ }
+}
+void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
+ g_flops_processed = 0;
+ g_benchmark_total_time_ns = 0;
+ g_benchmark_start_time_ns = NanoTime();
+ if (fn_ != NULL) {
+ fn_(iterations);
+ } else {
+ fn_range_(iterations, arg);
+ }
+ if (g_benchmark_start_time_ns != 0) {
+ g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+ }
+}
+void Benchmark::RunWithArg(int arg) {
+ // run once in case it's expensive
+ int iterations = 1;
+ RunRepeatedlyWithArg(iterations, arg);
+ while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) {
+ int last = iterations;
+ if (g_benchmark_total_time_ns/iterations == 0) {
+ iterations = 1e9;
+ } else {
+ iterations = 1e9 / (g_benchmark_total_time_ns/iterations);
+ }
+ iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last));
+ iterations = Round(iterations);
+ RunRepeatedlyWithArg(iterations, arg);
+ }
+ char throughput[100];
+ throughput[0] = '\0';
+ if (g_benchmark_total_time_ns > 0 && g_flops_processed > 0) {
+ double mflops_processed = static_cast<double>(g_flops_processed)/1e6;
+ double seconds = static_cast<double>(g_benchmark_total_time_ns)/1e9;
+ snprintf(throughput, sizeof(throughput), " %8.2f MFlops/s", mflops_processed/seconds);
+ }
+ char full_name[100];
+ if (fn_range_ != NULL) {
+ if (arg >= (1<<20)) {
+ snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
+ } else if (arg >= (1<<10)) {
+ snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
+ } else {
+ snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
+ }
+ } else {
+ snprintf(full_name, sizeof(full_name), "%s", name_);
+ }
+ printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
+ iterations, g_benchmark_total_time_ns/iterations, throughput);
+ fflush(stdout);
+}
+} // namespace testing
+void SetBenchmarkFlopsProcessed(int64_t x) {
+ g_flops_processed = x;
+}
+void StopBenchmarkTiming() {
+ if (g_benchmark_start_time_ns != 0) {
+ g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+ }
+ g_benchmark_start_time_ns = 0;
+}
+void StartBenchmarkTiming() {
+ if (g_benchmark_start_time_ns == 0) {
+ g_benchmark_start_time_ns = NanoTime();
+ }
+}
+int main(int argc, char* argv[]) {
+ if (gBenchmarks().empty()) {
+ fprintf(stderr, "No benchmarks registered!\n");
+ exit(EXIT_FAILURE);
+ }
+ for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+ int name_width = static_cast<int>(strlen(it->second->Name()));
+ g_name_column_width = std::max(g_name_column_width, name_width);
+ }
+ bool need_header = true;
+ for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+ ::testing::Benchmark* b = it->second;
+ if (b->ShouldRun(argc, argv)) {
+ if (need_header) {
+ printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
+ fflush(stdout);
+ need_header = false;
+ }
+ b->Run();
+ }
+ }
+ if (need_header) {
+ fprintf(stderr, "No matching benchmarks!\n");
+ fprintf(stderr, "Available benchmarks:\n");
+ for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
+ fprintf(stderr, " %s\n", it->second->Name());
+ }
+ exit(EXIT_FAILURE);
+ }
+ return 0;
+}
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 525b9acda..f3ec70a9e 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -4,13 +4,15 @@
typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "testing/base/public/benchmark.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+#include "benchmark.h"
+
+#define BENCHMARK_RANGE(bench, lo, hi) \
+ BENCHMARK(bench)->Range(lo, hi)
using Eigen::Tensor;
using Eigen::TensorMap;
-
// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device> class BenchmarkSuite {
@@ -38,12 +40,26 @@ template <typename Device> class BenchmarkSuite {
device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
}
// Record the number of values copied per second
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+ }
+
+ void typeCasting(int num_iters) {
+ eigen_assert(m_ == n_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
+ TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ B.device(device_) = A.cast<int>();
+ }
+ // Record the number of values copied per second
+ finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
@@ -51,21 +67,21 @@ template <typename Device> class BenchmarkSuite {
C.device(device_) = C.random();
}
// Record the number of random numbers generated per second
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
void slicing(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
- const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
- const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
- const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
- const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
- const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
+ const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
+ const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
+ const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
+ const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
+ const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -80,31 +96,59 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of values copied from the rhs slice to the lhs slice
// each second
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
+ }
+
+ void rowChip(int num_iters) {
+ const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+ const Eigen::array<TensorIndex, 1> output_size = {{n_}};
+ TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.chip(iter % k_, 0);
+ }
+ // Record the number of values copied from the rhs chip to the lhs.
+ finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
+ }
+
+ void colChip(int num_iters) {
+ const Eigen::array<TensorIndex, 2> input_size= {{k_, n_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+ const Eigen::array<TensorIndex, 1> output_size = {{n_}};
+ TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.chip(iter % n_, 1);
+ }
+ // Record the number of values copied from the rhs chip to the lhs.
+ finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
}
void shuffling(int num_iters) {
eigen_assert(m_ == n_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_);
+ const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(k_, m_);
+ const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
- const Eigen::array<int, 2> shuffle(1, 0);
+ const Eigen::array<int, 2> shuffle = {{1, 0}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.shuffle(shuffle);
}
// Record the number of values shuffled from A and copied to B each second
- finalizeBenchmark(m_ * k_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void padding(int num_iters) {
eigen_assert(m_ == k_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
+ const Eigen::array<TensorIndex, 2> size_a = {{m_, k_-3}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(k_, m_);
+ const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
@@ -116,35 +160,34 @@ template <typename Device> class BenchmarkSuite {
B.device(device_) = A.pad(paddings);
}
// Record the number of values copied from the padded tensor A each second
- finalizeBenchmark(m_ * k_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void striding(int num_iters) {
eigen_assert(m_ == k_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_);
+ const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
+ const Eigen::array<TensorIndex, 2> size_b = {{m_, k_ / 2}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
- const Eigen::array<TensorIndex, 2> strides(1, 2);
+ const Eigen::array<TensorIndex, 2> strides = {{1, 2}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.stride(strides);
}
 // Record the number of values copied from the strided tensor A each second
- finalizeBenchmark(m_ * k_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
}
void broadcasting(int num_iters) {
- const Eigen::array<TensorIndex, 2> size_a(m_, 1);
+ const Eigen::array<TensorIndex, 2> size_a = {{m_, 1}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_c(m_, n_);
+ const Eigen::array<TensorIndex, 2> size_c = {{m_, n_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
-#if defined(__CUDACC__)
- // nvcc doesn't support cxx11
- const Eigen::array<int, 2> broadcast(1, n_);
+#ifndef EIGEN_HAS_INDEX_LIST
+ const Eigen::array<int, 2> broadcast = {{1, n_}};
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
@@ -157,12 +200,12 @@ template <typename Device> class BenchmarkSuite {
C.device(device_) = A.broadcast(broadcast);
}
 // Record the number of values broadcast from A and copied to C each second
- finalizeBenchmark(m_ * n_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
}
void coeffWiseOp(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@@ -173,12 +216,12 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (2 multiplications and
// 1 addition per value)
- finalizeBenchmark(3 * m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
}
void algebraicFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@@ -189,12 +232,12 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
void transcendentalFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@@ -205,17 +248,49 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
}
- // Simple reduction
- void reduction(int num_iters) {
- const Eigen::array<TensorIndex, 2> input_size(k_, n_);
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
- const Eigen::array<TensorIndex, 1> output_size(n_);
- TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
+ // Row reduction
+ void rowReduction(int num_iters) {
+ const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+ const Eigen::array<TensorIndex, 1> output_size = {{n_}};
+ TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+ const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
+#else
+ // Take advantage of cxx11 to give the compiler information it can use to
+ // optimize the code.
+ Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
+#endif
- const Eigen::array<TensorIndex, 1> sum_along_dim(0);
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.sum(sum_along_dim);
+ }
+ // Record the number of FLOP executed per second (assuming one operation
+ // per value)
+ finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
+ }
+
+ // Column reduction
+ void colReduction(int num_iters) {
+ const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(
+ b_, input_size);
+ const Eigen::array<TensorIndex, 1> output_size = {{k_}};
+ TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(
+ c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+ const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}};
+#else
+ // Take advantage of cxx11 to give the compiler information it can use to
+ // optimize the code.
+ Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
+#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -223,21 +298,21 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
}
 // Do a contraction, which is equivalent to a matrix multiplication
void contraction(int num_iters) {
- const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
- const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
- const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
+ const Eigen::array<TensorIndex, 2> sizeA = {{m_, k_}};
+ const Eigen::array<TensorIndex, 2> sizeB = {{k_, n_}};
+ const Eigen::array<TensorIndex, 2> sizeC = {{m_, n_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<float, 2>::DimensionPair DimPair;
- const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+ const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -245,18 +320,18 @@ template <typename Device> class BenchmarkSuite {
}
 // Record the number of FLOP executed per second (k_ multiplications and
 // k_ additions for each value in the resulting tensor)
- finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
}
void convolution(int num_iters, int kernel_x, int kernel_y) {
- const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
+ const Eigen::array<TensorIndex, 2> input_sizes = {{m_, n_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
- const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
+ const Eigen::array<TensorIndex, 2> kernel_sizes = {{kernel_x, kernel_y}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
- const Eigen::array<TensorIndex, 2> result_sizes(
- m_ - kernel_x + 1, n_ - kernel_y + 1);
+ const Eigen::array<TensorIndex, 2> result_sizes =
+ {{m_ - kernel_x + 1, n_ - kernel_y + 1}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
- Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
+ Eigen::array<Tensor<float, 2>::Index, 2> dims = {{0, 1}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -264,8 +339,8 @@ template <typename Device> class BenchmarkSuite {
}
 // Record the number of FLOP executed per second (kernel_x * kernel_y
 // multiplications and additions for each value in the resulting tensor)
- finalizeBenchmark(
- (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
+ finalizeBenchmark(static_cast<int64_t>(2) *
+ (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
}
private:
@@ -280,23 +355,23 @@ template <typename Device> class BenchmarkSuite {
device_.memset(b_, 23, k_ * n_ * sizeof(float));
device_.memset(c_, 31, m_ * n_ * sizeof(float));
- BenchmarkUseRealTime();
+ //BenchmarkUseRealTime();
}
- inline void finalizeBenchmark(int64 num_items) {
+ inline void finalizeBenchmark(int64_t num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
device_.synchronize();
}
#endif
StopBenchmarkTiming();
- SetBenchmarkItemsProcessed(num_items);
+ SetBenchmarkFlopsProcessed(num_items);
}
- size_t m_;
- size_t k_;
- size_t n_;
+ TensorIndex m_;
+ TensorIndex k_;
+ TensorIndex n_;
float* a_;
float* b_;
float* c_;
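
Note on the recurring static_cast<int64_t> promotions above: the benchmark sizes go up to N = 5000, so a product such as m_ * k_ * num_iters is evaluated in 32-bit int before reaching finalizeBenchmark, and 5000 * 5000 * 100 (about 2.5e9) already exceeds INT_MAX. A minimal sketch of the failure mode and the fix, assuming plain int operands:

    // Hypothetical values in the range exercised by BENCHMARK_RANGE(..., 10, 5000).
    int m = 5000, k = 5000, num_iters = 100;
    long long bad  = m * k * num_iters;                         // multiplies in int: overflows (UB)
    long long good = static_cast<long long>(m) * k * num_iters; // widens first: 2500000000, correct
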
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
index 68653ba15..6754e1a32 100644
--- a/bench/tensors/tensor_benchmarks_cpu.cc
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -1,19 +1,12 @@
#define EIGEN_USE_THREADS
-#include "base/sysinfo.h"
-#include "strings/strcat.h"
-#include "third_party/eigen3/tensor_benchmarks.h"
-#include "thread/threadpool.h"
+#include <string>
+
+#include "tensor_benchmarks.h"
-#ifdef __ANDROID__
-#define CREATE_THREAD_POOL(threads) \
-Eigen::ThreadPoolDevice device(threads);
-#else
#define CREATE_THREAD_POOL(threads) \
-ThreadPool tp(threads); \
-tp.StartWorkers(); \
-Eigen::ThreadPoolDevice device(&tp, threads);
-#endif
+Eigen::ThreadPool pool(threads); \
+Eigen::ThreadPoolDevice device(&pool, threads);
// Simple functions
#define BM_FuncCPU(FUNC, THREADS) \
@@ -22,7 +15,6 @@ Eigen::ThreadPoolDevice device(&tp, threads);
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters); \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
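
For reference, an invocation such as BM_FuncCPU(memcpy, 4) expands to roughly the following (a sketch with the macro bodies from above substituted; the exact generated signature is whatever BENCHMARK_RANGE expects):

    static void BM_memcpy_4T(int iters, int N) {
      Eigen::ThreadPool pool(4);                  // CREATE_THREAD_POOL(4)
      Eigen::ThreadPoolDevice device(&pool, 4);
      BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);
      suite.memcpy(iters);                        // run the benchmarked kernel
    }
    BENCHMARK_RANGE(BM_memcpy_4T, 10, 5000);
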
@@ -30,6 +22,10 @@ BM_FuncCPU(memcpy, 4);
BM_FuncCPU(memcpy, 8);
BM_FuncCPU(memcpy, 12);
+BM_FuncCPU(typeCasting, 4);
+BM_FuncCPU(typeCasting, 8);
+BM_FuncCPU(typeCasting, 12);
+
BM_FuncCPU(random, 4);
BM_FuncCPU(random, 8);
BM_FuncCPU(random, 12);
@@ -38,6 +34,14 @@ BM_FuncCPU(slicing, 4);
BM_FuncCPU(slicing, 8);
BM_FuncCPU(slicing, 12);
+BM_FuncCPU(rowChip, 4);
+BM_FuncCPU(rowChip, 8);
+BM_FuncCPU(rowChip, 12);
+
+BM_FuncCPU(colChip, 4);
+BM_FuncCPU(colChip, 8);
+BM_FuncCPU(colChip, 12);
+
BM_FuncCPU(shuffling, 4);
BM_FuncCPU(shuffling, 8);
BM_FuncCPU(shuffling, 12);
@@ -66,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4);
BM_FuncCPU(transcendentalFunc, 8);
BM_FuncCPU(transcendentalFunc, 12);
-BM_FuncCPU(reduction, 4);
-BM_FuncCPU(reduction, 8);
-BM_FuncCPU(reduction, 12);
+BM_FuncCPU(rowReduction, 4);
+BM_FuncCPU(rowReduction, 8);
+BM_FuncCPU(rowReduction, 12);
+
+BM_FuncCPU(colReduction, 4);
+BM_FuncCPU(colReduction, 8);
+BM_FuncCPU(colReduction, 12);
// Contractions
@@ -84,7 +92,6 @@ BM_FuncCPU(reduction, 12);
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
@@ -107,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
+
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
@@ -127,7 +140,6 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
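
The CREATE_THREAD_POOL rewrite above drops the Google-internal ThreadPool (with its StartWorkers call and StrCat labels) in favor of Eigen's own pool. A minimal standalone sketch of the new device setup, assuming the unsupported CXX11 Tensor module:

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::ThreadPool pool(4);                 // worker threads
      Eigen::ThreadPoolDevice device(&pool, 4);  // handed to .device(...) below
      Eigen::Tensor<float, 1> a(1000), b(1000);
      a.setRandom();
      b.device(device) = a + a;                  // expression evaluated on the pool
      return 0;
    }
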
diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cu
index adea754ad..611e8197b 100644
--- a/bench/tensors/tensor_benchmarks_gpu.cc
+++ b/bench/tensors/tensor_benchmarks_gpu.cu
@@ -3,47 +3,47 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
-#include "strings/strcat.h"
-#include "third_party/eigen3/tensor_benchmarks.h"
-
+#include "tensor_benchmarks.h"
// Simple functions
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
- cudaStream_t stream; \
- cudaStreamCreate(&stream); \
+ Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
- cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
-BM_FuncGPU(reduction);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
// Contractions
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
- cudaStream_t stream; \
- cudaStreamCreate(&stream); \
+ Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
- cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
@@ -51,19 +51,18 @@ BM_FuncGPU(reduction);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
- cudaStream_t stream; \
- cudaStreamCreate(&stream); \
+ Eigen::CudaStreamDevice stream; \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \
- cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
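
On the GPU side, the macros now rely on Eigen::CudaStreamDevice, which implements Eigen's stream interface and owns the underlying cudaStream_t, so the manual cudaStreamCreate / cudaStreamDestroy pair disappears. A sketch of the new setup under those assumptions:

    Eigen::CudaStreamDevice stream;   // creates and owns a CUDA stream
    Eigen::GpuDevice device(&stream); // .device(device) expressions run on that stream
    // ... run benchmarks ...
    // the cudaStream_t is released when 'stream' goes out of scope
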
diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h
index 9b845de22..2edc51596 100644
--- a/blas/level2_cplx_impl.h
+++ b/blas/level2_cplx_impl.h
@@ -19,19 +19,12 @@
int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
{
typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run);
- func[LO] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run),
+ // array index: LO
+ (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* x = reinterpret_cast<Scalar*>(px);
@@ -111,19 +104,12 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pap)
{
typedef void (*functype)(int, Scalar*, const Scalar*, RealScalar);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
- func[LO] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+ // array index: LO
+ (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+ };
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* ap = reinterpret_cast<Scalar*>(pap);
@@ -162,19 +148,12 @@ int EIGEN_BLAS_FUNC(hpr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap)
{
typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (internal::packed_rank2_update_selector<Scalar,int,Upper>::run);
- func[LO] = (internal::packed_rank2_update_selector<Scalar,int,Lower>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (internal::packed_rank2_update_selector<Scalar,int,Upper>::run),
+ // array index: LO
+ (internal::packed_rank2_update_selector<Scalar,int,Lower>::run),
+ };
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
@@ -217,19 +196,12 @@ int EIGEN_BLAS_FUNC(hpr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pa, int *lda)
{
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
- func[LO] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+ // array index: LO
+ (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+ };
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* a = reinterpret_cast<Scalar*>(pa);
@@ -271,19 +243,12 @@ int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
int EIGEN_BLAS_FUNC(her2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda)
{
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (internal::rank2_update_selector<Scalar,int,Upper>::run);
- func[LO] = (internal::rank2_update_selector<Scalar,int,Lower>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (internal::rank2_update_selector<Scalar,int,Upper>::run),
+ // array index: LO
+ (internal::rank2_update_selector<Scalar,int,Lower>::run),
+ };
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
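
Every wrapper in this file trades the lazily initialized dispatch table (a mutable static array plus a static bool init guard, which is racy if two threads make the first call concurrently) for a static const aggregate that is initialized exactly once by the compiler. A minimal sketch of the pattern with hypothetical kernels f_upper/f_lower (placeholders, not Eigen names):

    typedef void (*functype)(int);
    static void f_upper(int /*n*/) { /* kernel for the Upper case */ }
    static void f_lower(int /*n*/) { /* kernel for the Lower case */ }

    enum { UP = 0, LO = 1 };  // assumed encoding, matching the table indices above

    void dispatch(int uplo, int n) {
      static const functype func[2] = {
        // array index: UP
        f_upper,
        // array index: LO
        f_lower,
      };
      func[uplo](n);  // constant data: no first-call initialization, no data race
    }
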
diff --git a/blas/level2_impl.h b/blas/level2_impl.h
index 917f2e372..d09db0cc6 100644
--- a/blas/level2_impl.h
+++ b/blas/level2_impl.h
@@ -26,20 +26,15 @@ struct general_matrix_vector_product_wrapper
int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc)
{
typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar);
- static functype func[4];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<4; ++k)
- func[k] = 0;
-
- func[NOTR] = (general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run);
- func[TR ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run);
- func[ADJ ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run);
-
- init = true;
- }
+ static const functype func[4] = {
+ // array index: NOTR
+ (general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run),
+ // array index: TR
+ (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run),
+ // array index: ADJ
+ (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run),
+ 0
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
@@ -90,32 +85,36 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
{
typedef void (*functype)(int, const Scalar *, int, Scalar *);
- static functype func[16];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<16; ++k)
- func[k] = 0;
-
- func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,ColMajor>::run);
- func[TR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,RowMajor>::run);
- func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, Conj, RowMajor>::run);
-
- func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,ColMajor>::run);
- func[TR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,RowMajor>::run);
- func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, Conj, RowMajor>::run);
-
- func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run);
- func[TR | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run);
- func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run);
-
- func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run);
- func[TR | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run);
- func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run);
-
- init = true;
- }
+ static const functype func[16] = {
+ // array index: NOTR | (UP << 2) | (NUNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,ColMajor>::run),
+ // array index: TR | (UP << 2) | (NUNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (NUNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, Conj, RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (NUNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,ColMajor>::run),
+ // array index: TR | (LO << 2) | (NUNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (NUNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, Conj, RowMajor>::run),
+ 0,
+ // array index: NOTR | (UP << 2) | (UNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run),
+ // array index: TR | (UP << 2) | (UNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (UNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (UNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run),
+ // array index: TR | (LO << 2) | (UNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (UNIT << 3)
+ (internal::triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run),
+ 0
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
@@ -145,32 +144,36 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
{
typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, const Scalar&);
- static functype func[16];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<16; ++k)
- func[k] = 0;
-
- func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,ColMajor>::run);
- func[TR | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,RowMajor>::run);
- func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,Conj, Scalar,false,RowMajor>::run);
-
- func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,ColMajor>::run);
- func[TR | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,RowMajor>::run);
- func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,Conj, Scalar,false,RowMajor>::run);
-
- func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
- func[TR | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
- func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
-
- func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
- func[TR | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
- func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
-
- init = true;
- }
+ static const functype func[16] = {
+ // array index: NOTR | (UP << 2) | (NUNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,ColMajor>::run),
+ // array index: TR | (UP << 2) | (NUNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (NUNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,Conj, Scalar,false,RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (NUNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,ColMajor>::run),
+ // array index: TR | (LO << 2) | (NUNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (NUNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Upper|0, Scalar,Conj, Scalar,false,RowMajor>::run),
+ 0,
+ // array index: NOTR | (UP << 2) | (UNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+ // array index: TR | (UP << 2) | (UNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (UNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (UNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+ // array index: TR | (LO << 2) | (UNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (UNIT << 3)
+ (internal::triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+ 0
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
@@ -346,32 +349,36 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea
int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, RealScalar *pa, int *lda, RealScalar *px, int *incx)
{
typedef void (*functype)(int, int, const Scalar *, int, Scalar *);
- static functype func[16];
-
- static bool init = false;
- if(!init)
- {
- for(int i=0; i<16; ++i)
- func[i] = 0;
-
- func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,ColMajor>::run);
- func[TR | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,RowMajor>::run);
- func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0, Scalar,Conj, Scalar,RowMajor>::run);
-
- func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,ColMajor>::run);
- func[TR | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,RowMajor>::run);
- func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|0, Scalar,Conj, Scalar,RowMajor>::run);
-
- func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,ColMajor>::run);
- func[TR | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,RowMajor>::run);
- func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run);
-
- func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,ColMajor>::run);
- func[TR | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,RowMajor>::run);
- func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run);
-
- init = true;
- }
+ static const functype func[16] = {
+ // array index: NOTR | (UP << 2) | (NUNIT << 3)
+ (internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,ColMajor>::run),
+ // array index: TR | (UP << 2) | (NUNIT << 3)
+ (internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (NUNIT << 3)
+ (internal::band_solve_triangular_selector<int,Lower|0, Scalar,Conj, Scalar,RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (NUNIT << 3)
+ (internal::band_solve_triangular_selector<int,Lower|0, Scalar,false,Scalar,ColMajor>::run),
+ // array index: TR | (LO << 2) | (NUNIT << 3)
+ (internal::band_solve_triangular_selector<int,Upper|0, Scalar,false,Scalar,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (NUNIT << 3)
+ (internal::band_solve_triangular_selector<int,Upper|0, Scalar,Conj, Scalar,RowMajor>::run),
+ 0,
+ // array index: NOTR | (UP << 2) | (UNIT << 3)
+ (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,ColMajor>::run),
+ // array index: TR | (UP << 2) | (UNIT << 3)
+ (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (UNIT << 3)
+ (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (UNIT << 3)
+ (internal::band_solve_triangular_selector<int,Lower|UnitDiag,Scalar,false,Scalar,ColMajor>::run),
+ // array index: TR | (LO << 2) | (UNIT << 3)
+ (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,false,Scalar,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (UNIT << 3)
+ (internal::band_solve_triangular_selector<int,Upper|UnitDiag,Scalar,Conj, Scalar,RowMajor>::run),
+ 0,
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* x = reinterpret_cast<Scalar*>(px);
@@ -416,32 +423,36 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real
int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx)
{
typedef void (*functype)(int, const Scalar*, const Scalar*, Scalar*, Scalar);
- static functype func[16];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<16; ++k)
- func[k] = 0;
-
- func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,ColMajor>::run);
- func[TR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,RowMajor>::run);
- func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,Conj, Scalar,false,RowMajor>::run);
-
- func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,ColMajor>::run);
- func[TR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,RowMajor>::run);
- func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,Conj, Scalar,false,RowMajor>::run);
-
- func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
- func[TR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
- func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
-
- func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run);
- func[TR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run);
- func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run);
-
- init = true;
- }
+ static const functype func[16] = {
+ // array index: NOTR | (UP << 2) | (NUNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,ColMajor>::run),
+ // array index: TR | (UP << 2) | (NUNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (NUNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,Conj, Scalar,false,RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (NUNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Lower|0, Scalar,false,Scalar,false,ColMajor>::run),
+ // array index: TR | (LO << 2) | (NUNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,false,Scalar,false,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (NUNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Upper|0, Scalar,Conj, Scalar,false,RowMajor>::run),
+ 0,
+ // array index: NOTR | (UP << 2) | (UNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+ // array index: TR | (UP << 2) | (UNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (UNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (UNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Lower|UnitDiag,Scalar,false,Scalar,false,ColMajor>::run),
+ // array index: TR | (LO << 2) | (UNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,false,Scalar,false,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (UNIT << 3)
+ (internal::packed_triangular_matrix_vector_product<int,Upper|UnitDiag,Scalar,Conj, Scalar,false,RowMajor>::run),
+ 0
+ };
Scalar* ap = reinterpret_cast<Scalar*>(pap);
Scalar* x = reinterpret_cast<Scalar*>(px);
@@ -487,32 +498,36 @@ int EIGEN_BLAS_FUNC(tpmv)(char *uplo, char *opa, char *diag, int *n, RealScalar
int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pap, RealScalar *px, int *incx)
{
typedef void (*functype)(int, const Scalar*, Scalar*);
- static functype func[16];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<16; ++k)
- func[k] = 0;
-
- func[NOTR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,ColMajor>::run);
- func[TR | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,RowMajor>::run);
- func[ADJ | (UP << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, Conj, RowMajor>::run);
-
- func[NOTR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,ColMajor>::run);
- func[TR | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,RowMajor>::run);
- func[ADJ | (LO << 2) | (NUNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, Conj, RowMajor>::run);
-
- func[NOTR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run);
- func[TR | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run);
- func[ADJ | (UP << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run);
-
- func[NOTR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run);
- func[TR | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run);
- func[ADJ | (LO << 2) | (UNIT << 3)] = (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run);
-
- init = true;
- }
+ static const functype func[16] = {
+ // array index: NOTR | (UP << 2) | (NUNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,ColMajor>::run),
+ // array index: TR | (UP << 2) | (NUNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (NUNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, Conj, RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (NUNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|0, false,ColMajor>::run),
+ // array index: TR | (LO << 2) | (NUNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, false,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (NUNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|0, Conj, RowMajor>::run),
+ 0,
+ // array index: NOTR | (UP << 2) | (UNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,ColMajor>::run),
+ // array index: TR | (UP << 2) | (UNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,RowMajor>::run),
+ // array index: ADJ | (UP << 2) | (UNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,Conj, RowMajor>::run),
+ 0,
+ // array index: NOTR | (LO << 2) | (UNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Lower|UnitDiag,false,ColMajor>::run),
+ // array index: TR | (LO << 2) | (UNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,false,RowMajor>::run),
+ // array index: ADJ | (LO << 2) | (UNIT << 3)
+ (internal::packed_triangular_solve_vector<Scalar,Scalar,int,OnTheLeft, Upper|UnitDiag,Conj, RowMajor>::run),
+ 0
+ };
Scalar* ap = reinterpret_cast<Scalar*>(pap);
Scalar* x = reinterpret_cast<Scalar*>(px);
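
The 16-entry tables in this file are indexed by packing three BLAS parameters into one integer: the transpose op in bits 0-1, UP/LO in bit 2, NUNIT/UNIT in bit 3. The op field only takes the values 0-2 (NOTR, TR, ADJ — assumed encodings, consistent with how the indices are written above), so every fourth slot is unreachable and is filled with a 0 placeholder. A sketch of the arithmetic:

    // Assumed encodings: NOTR=0, TR=1, ADJ=2; UP=0, LO=1; NUNIT=0, UNIT=1.
    int code(int op, int uplo, int diag) {
      return op | (uplo << 2) | (diag << 3);  // 0..15; op == 3 never occurs
    }
    // e.g. code(/*TR*/1, /*LO*/1, /*UNIT*/1) == 1 | 4 | 8 == 13
    // trsm/trmm below extend the scheme with a LEFT/RIGHT bit:
    //   op | (side << 2) | (uplo << 3) | (diag << 4)  -> 32 slots
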
diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h
index cac89b268..4896a03d9 100644
--- a/blas/level2_real_impl.h
+++ b/blas/level2_real_impl.h
@@ -13,19 +13,12 @@
int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
{
typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run);
- func[LO] = (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Upper,false,false>::run),
+ // array index: LO
+ (internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* x = reinterpret_cast<Scalar*>(px);
@@ -71,34 +64,13 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pc, int *ldc)
{
-// typedef void (*functype)(int, const Scalar *, int, Scalar *, int, Scalar);
-// static functype func[2];
-
-// static bool init = false;
-// if(!init)
-// {
-// for(int k=0; k<2; ++k)
-// func[k] = 0;
-//
-// func[UP] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,UpperTriangular>::run);
-// func[LO] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,LowerTriangular>::run);
-
-// init = true;
-// }
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
- func[LO] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run),
+ // array index: LO
+ (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
+ };
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* c = reinterpret_cast<Scalar*>(pc);
@@ -131,34 +103,13 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
// C := alpha*x*y' + alpha*y*x' + C
int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, int *ldc)
{
-// typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar);
-// static functype func[2];
-//
-// static bool init = false;
-// if(!init)
-// {
-// for(int k=0; k<2; ++k)
-// func[k] = 0;
-//
-// func[UP] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,UpperTriangular>::run);
-// func[LO] = (internal::selfadjoint_product<Scalar,ColMajor,ColMajor,false,LowerTriangular>::run);
-//
-// init = true;
-// }
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (internal::rank2_update_selector<Scalar,int,Upper>::run);
- func[LO] = (internal::rank2_update_selector<Scalar,int,Lower>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (internal::rank2_update_selector<Scalar,int,Upper>::run),
+ // array index: LO
+ (internal::rank2_update_selector<Scalar,int,Lower>::run),
+ };
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
@@ -234,19 +185,12 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *pap)
{
typedef void (*functype)(int, Scalar*, const Scalar*, Scalar);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,false>::run);
- func[LO] = (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,false>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Upper,false,false>::run),
+ // array index: LO
+ (internal::selfadjoint_packed_rank1_update<Scalar,int,ColMajor,Lower,false,false>::run),
+ };
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* ap = reinterpret_cast<Scalar*>(pap);
@@ -285,19 +229,12 @@ int EIGEN_BLAS_FUNC(spr)(char *uplo, int *n, Scalar *palpha, Scalar *px, int *in
int EIGEN_BLAS_FUNC(spr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pap)
{
typedef void (*functype)(int, Scalar*, const Scalar*, const Scalar*, Scalar);
- static functype func[2];
-
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<2; ++k)
- func[k] = 0;
-
- func[UP] = (internal::packed_rank2_update_selector<Scalar,int,Upper>::run);
- func[LO] = (internal::packed_rank2_update_selector<Scalar,int,Lower>::run);
-
- init = true;
- }
+ static const functype func[2] = {
+ // array index: UP
+ (internal::packed_rank2_update_selector<Scalar,int,Upper>::run),
+ // array index: LO
+ (internal::packed_rank2_update_selector<Scalar,int,Lower>::run),
+ };
Scalar* x = reinterpret_cast<Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
diff --git a/blas/level3_impl.h b/blas/level3_impl.h
index 6a6b00728..267a727ef 100644
--- a/blas/level3_impl.h
+++ b/blas/level3_impl.h
@@ -13,24 +13,29 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
{
// std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n";
typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar, internal::level3_blocking<Scalar,Scalar>&, Eigen::internal::GemmParallelInfo<DenseIndex>*);
- static functype func[12];
-
- static bool init = false;
- if(!init)
- {
- for(int i=0; i<12; ++i)
- func[i] = 0;
- func[NOTR | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,ColMajor,false,ColMajor>::run);
- func[TR | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,false,ColMajor>::run);
- func[ADJ | (NOTR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor>::run);
- func[NOTR | (TR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,false,ColMajor>::run);
- func[TR | (TR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,false,ColMajor>::run);
- func[ADJ | (TR << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,false,ColMajor>::run);
- func[NOTR | (ADJ << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor>::run);
- func[TR | (ADJ << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,Conj, ColMajor>::run);
- func[ADJ | (ADJ << 2)] = (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,Conj, ColMajor>::run);
- init = true;
- }
+ static const functype func[12] = {
+ // array index: NOTR | (NOTR << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,ColMajor,false,ColMajor>::run),
+ // array index: TR | (NOTR << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,false,ColMajor>::run),
+ // array index: ADJ | (NOTR << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor>::run),
+ 0,
+ // array index: NOTR | (TR << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,false,ColMajor>::run),
+ // array index: TR | (TR << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,false,ColMajor>::run),
+ // array index: ADJ | (TR << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,false,ColMajor>::run),
+ 0,
+ // array index: NOTR | (ADJ << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor>::run),
+ // array index: TR | (ADJ << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,false,Scalar,RowMajor,Conj, ColMajor>::run),
+ // array index: ADJ | (ADJ << 2)
+ (internal::general_matrix_matrix_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,RowMajor,Conj, ColMajor>::run),
+ 0
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
@@ -73,49 +78,64 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
{
// std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n";
typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, internal::level3_blocking<Scalar,Scalar>&);
- static functype func[32];
-
- static bool init = false;
- if(!init)
- {
- for(int i=0; i<32; ++i)
- func[i] = 0;
-
- func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,ColMajor,ColMajor>::run);
- func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,RowMajor,ColMajor>::run);
- func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, Conj, RowMajor,ColMajor>::run);
-
- func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, false,ColMajor,ColMajor>::run);
- func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, false,RowMajor,ColMajor>::run);
- func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, Conj, RowMajor,ColMajor>::run);
-
- func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,ColMajor,ColMajor>::run);
- func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,RowMajor,ColMajor>::run);
- func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, Conj, RowMajor,ColMajor>::run);
-
- func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, false,ColMajor,ColMajor>::run);
- func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, false,RowMajor,ColMajor>::run);
- func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, Conj, RowMajor,ColMajor>::run);
-
-
- func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,ColMajor,ColMajor>::run);
- func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,RowMajor,ColMajor>::run);
- func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,Conj, RowMajor,ColMajor>::run);
-
- func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,ColMajor,ColMajor>::run);
- func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,RowMajor,ColMajor>::run);
- func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,Conj, RowMajor,ColMajor>::run);
-
- func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,ColMajor,ColMajor>::run);
- func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,RowMajor,ColMajor>::run);
- func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,Conj, RowMajor,ColMajor>::run);
-
- func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,ColMajor,ColMajor>::run);
- func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,RowMajor,ColMajor>::run);
- func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,Conj, RowMajor,ColMajor>::run);
-
- init = true;
- }
+ static const functype func[32] = {
+ // array index: NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,ColMajor,ColMajor>::run),
+ // array index: TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,RowMajor,ColMajor>::run),
+ // array index: ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, Conj, RowMajor,ColMajor>::run),
+ 0,
+ // array index: NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, false,ColMajor,ColMajor>::run),
+ // array index: TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, false,RowMajor,ColMajor>::run),
+ // array index: ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, Conj, RowMajor,ColMajor>::run),
+ 0,
+ // array index: NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|0, false,ColMajor,ColMajor>::run),
+ // array index: TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, false,RowMajor,ColMajor>::run),
+ // array index: ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|0, Conj, RowMajor,ColMajor>::run),
+ 0,
+ // array index: NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|0, false,ColMajor,ColMajor>::run),
+ // array index: TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, false,RowMajor,ColMajor>::run),
+ // array index: ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|0, Conj, RowMajor,ColMajor>::run),
+ 0,
+ // array index: NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,ColMajor,ColMajor>::run),
+ // array index: TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,RowMajor,ColMajor>::run),
+ // array index: ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,Conj, RowMajor,ColMajor>::run),
+ 0,
+ // array index: NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,ColMajor,ColMajor>::run),
+ // array index: TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,RowMajor,ColMajor>::run),
+ // array index: ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,Conj, RowMajor,ColMajor>::run),
+ 0,
+ // array index: NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Lower|UnitDiag,false,ColMajor,ColMajor>::run),
+ // array index: TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,false,RowMajor,ColMajor>::run),
+ // array index: ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheLeft, Upper|UnitDiag,Conj, RowMajor,ColMajor>::run),
+ 0,
+ // array index: NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Lower|UnitDiag,false,ColMajor,ColMajor>::run),
+ // array index: TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,false,RowMajor,ColMajor>::run),
+ // array index: ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::triangular_solve_matrix<Scalar,DenseIndex,OnTheRight,Upper|UnitDiag,Conj, RowMajor,ColMajor>::run),
+ 0
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
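The lookup side of these tables is unchanged: the BLAS option characters are packed into a 5-bit code that indexes the array directly. A minimal sketch of the dispatch, assuming the usual option codes from blas/common.h (NOTR=0, TR=1, ADJ=2; LEFT=0, RIGHT=1; UP=0, LO=1; NUNIT=0, UNIT=1):

    int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4);
    if(code >= 32 || func[code] == 0)  // the 0 entries pad each {NOTR,TR,ADJ} triple to a 2-bit field
      return 0;                        // invalid option combination
    func[code](/* sizes, operand pointers, strides, alpha, blocking */);

Replacing the lazily initialized table with a static const aggregate removes the non-thread-safe first-call initialization; the trmm table below uses the same layout.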
@@ -162,47 +182,64 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
{
// std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n";
typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
- static functype func[32];
- static bool init = false;
- if(!init)
- {
- for(int k=0; k<32; ++k)
- func[k] = 0;
-
- func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, ColMajor,false,ColMajor,false,ColMajor>::run);
- func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, RowMajor,false,ColMajor,false,ColMajor>::run);
- func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-
- func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,ColMajor,false,ColMajor>::run);
- func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,RowMajor,false,ColMajor>::run);
- func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-
- func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, ColMajor,false,ColMajor,false,ColMajor>::run);
- func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, RowMajor,false,ColMajor,false,ColMajor>::run);
- func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-
- func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,ColMajor,false,ColMajor>::run);
- func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,RowMajor,false,ColMajor>::run);
- func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-
- func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run);
- func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run);
- func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-
- func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run);
- func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run);
- func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-
- func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run);
- func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run);
- func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-
- func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run);
- func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run);
- func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-
- init = true;
- }
+ static const functype func[32] = {
+ // array index: NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, ColMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, RowMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
+ 0,
+ // array index: NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,RowMajor,false,ColMajor>::run),
+ // array index: ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
+ 0,
+ // array index: NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, true, ColMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, RowMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
+ 0,
+ // array index: NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|0, false,ColMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,RowMajor,false,ColMajor>::run),
+ // array index: ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
+ 0,
+ // array index: NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
+ 0,
+ // array index: NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run),
+ // array index: ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
+ 0,
+ // array index: NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run),
+ 0,
+ // array index: NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Lower|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run),
+ // array index: TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run),
+ // array index: ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)
+ (internal::product_triangular_matrix_matrix<Scalar,DenseIndex,Upper|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run),
+ 0
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
@@ -275,9 +312,9 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
return 1;
}
+ int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
#if ISCOMPLEX
// FIXME add support for symmetric complex matrix
- int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
Matrix<Scalar,Dynamic,Dynamic,ColMajor> matA(size,size);
if(UPLO(*uplo)==UP)
{
@@ -294,13 +331,15 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
else if(SIDE(*side)==RIGHT)
matrix(c, *m, *n, *ldc) += alpha * matrix(b, *m, *n, *ldb) * matA;
#else
+ internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,size,1,false);
+
if(SIDE(*side)==LEFT)
- if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar, DenseIndex, RowMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
- else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+ if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar, DenseIndex, RowMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
+ else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
else return 0;
else if(SIDE(*side)==RIGHT)
- if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, RowMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
- else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
+ if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, RowMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
+ else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar, DenseIndex, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
else return 0;
else
return 0;
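The new blocking argument follows the same idiom in symm, syrk, hemm, and herk: a stack-allocated gemm_blocking_space sized on the operand dimensions is passed down so the kernel no longer manages its own packing buffers. A minimal sketch of the idiom (the constructor arguments are read off the calls above as rows, cols, depth, number of threads, and an L3-only flag; treat that reading as an assumption):

    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic>
        blocking(*m, *n, size, /*num_threads=*/1, /*l3_blocking_only=*/false);
    internal::product_selfadjoint_matrix<Scalar, DenseIndex, RowMajor,true,false, ColMajor,false,false, ColMajor>
        ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);  // blocking passed by reference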
@@ -315,25 +354,23 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
{
// std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
#if !ISCOMPLEX
- typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&);
- static functype func[8];
-
- static bool init = false;
- if(!init)
- {
- for(int i=0; i<8; ++i)
- func[i] = 0;
-
- func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Upper>::run);
- func[TR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Upper>::run);
- func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Upper>::run);
-
- func[NOTR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Lower>::run);
- func[TR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Lower>::run);
- func[ADJ | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Lower>::run);
-
- init = true;
- }
+ typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
+ static const functype func[8] = {
+ // array index: NOTR | (UP << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Upper>::run),
+ // array index: TR | (UP << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Upper>::run),
+ // array index: ADJ | (UP << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Upper>::run),
+ 0,
+ // array index: NOTR | (LO << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,ColMajor,Conj, Lower>::run),
+ // array index: TR | (LO << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,false,Scalar,ColMajor,ColMajor,Conj, Lower>::run),
+ // array index: ADJ | (LO << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,ColMajor,false,Lower>::run),
+ 0
+ };
#endif
Scalar* a = reinterpret_cast<Scalar*>(pa);
@@ -381,8 +418,10 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
matrix(c, *n, *n, *ldc).triangularView<Lower>() += alpha * matrix(a,*k,*n,*lda).transpose() * matrix(a,*k,*n,*lda);
}
#else
+ internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*n,*n,*k,1,false);
+
int code = OP(*op) | (UPLO(*uplo) << 2);
- func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha);
+ func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking);
#endif
return 0;
@@ -486,20 +525,23 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
return 1;
}
+ int size = (SIDE(*side)==LEFT) ? (*m) : (*n);
+ internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,size,1,false);
+
if(SIDE(*side)==LEFT)
{
if(UPLO(*uplo)==UP) internal::product_selfadjoint_matrix<Scalar,DenseIndex,RowMajor,true,Conj, ColMajor,false,false, ColMajor>
- ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+ ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,true,false, ColMajor,false,false, ColMajor>
- ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+ ::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha, blocking);
else return 0;
}
else if(SIDE(*side)==RIGHT)
{
if(UPLO(*uplo)==UP) matrix(c,*m,*n,*ldc) += alpha * matrix(b,*m,*n,*ldb) * matrix(a,*n,*n,*lda).selfadjointView<Upper>();/*internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,false,false, RowMajor,true,Conj, ColMajor>
- ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);*/
+ ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);*/
else if(UPLO(*uplo)==LO) internal::product_selfadjoint_matrix<Scalar,DenseIndex,ColMajor,false,false, ColMajor,true,false, ColMajor>
- ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
+ ::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha, blocking);
else return 0;
}
else
@@ -516,23 +558,21 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
{
// std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
- typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&);
- static functype func[8];
-
- static bool init = false;
- if(!init)
- {
- for(int i=0; i<8; ++i)
- func[i] = 0;
-
- func[NOTR | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Upper>::run);
- func[ADJ | (UP << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Upper>::run);
-
- func[NOTR | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Lower>::run);
- func[ADJ | (LO << 2)] = (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Lower>::run);
-
- init = true;
- }
+ typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
+ static const functype func[8] = {
+ // array index: NOTR | (UP << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Upper>::run),
+ 0,
+ // array index: ADJ | (UP << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Upper>::run),
+ 0,
+ // array index: NOTR | (LO << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,ColMajor,false,Scalar,RowMajor,Conj, ColMajor,Lower>::run),
+ 0,
+ // array index: ADJ | (LO << 2)
+ (internal::general_matrix_matrix_triangular_product<DenseIndex,Scalar,RowMajor,Conj, Scalar,ColMajor,false,ColMajor,Lower>::run),
+ 0
+ };
Scalar* a = reinterpret_cast<Scalar*>(pa);
Scalar* c = reinterpret_cast<Scalar*>(pc);
@@ -571,7 +611,8 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
if(*k>0 && alpha!=RealScalar(0))
{
- func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha);
+ internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*n,*n,*k,1,false);
+ func[code](*n, *k, a, *lda, a, *lda, c, *ldc, alpha, blocking);
matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
}
return 0;
diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt
index c604e575c..9599df60b 100644
--- a/doc/AsciiQuickReference.txt
+++ b/doc/AsciiQuickReference.txt
@@ -44,7 +44,7 @@ C.setRandom(rows,cols) // C = rand(rows,cols)*2-1
VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)'
v.setLinSpaced(size,low,high) // v = linspace(low,high,size)'
VectorXi::LinSpaced(((hi-low)/step)+1, // low:step:hi
- low,low+step*(size-1))
+ low,low+step*(size-1)) //
// Matrix slicing and blocks. All expressions listed here are read/write.
@@ -94,6 +94,8 @@ R.transpose() // R.' or conj(R') // Read-write
R.diagonal() // diag(R) // Read-write
x.asDiagonal() // diag(x)
R.transpose().colwise().reverse() // rot90(R) // Read-write
+R.rowwise().reverse() // fliplr(R)
+R.colwise().reverse() // flipud(R)
R.replicate(i,j) // repmat(P,i,j)
@@ -139,6 +141,7 @@ R.cwiseAbs2() // abs(P.^2)
R.array().abs2() // abs(P.^2)
(R.array() < s).select(P,Q ); // (R < s ? P : Q)
R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0)
+R = P.unaryExpr(ptr_fun(func)) // R = arrayfun(func, P) // with: scalar func(const scalar &x);
// Reductions.
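A minimal sketch of the unaryExpr line above, with a hypothetical scalar function (ptr_fun matches the pre-C++11 spelling used in the reference; in C++11 a plain function pointer or lambda works directly):

    float func(const float& x) { return x > 0.f ? x : 0.f; }    // hypothetical user function
    R = P.unaryExpr(std::ptr_fun(func));                         // R = arrayfun(func, P)
    R = P.unaryExpr([](float x) { return x > 0.f ? x : 0.f; }); // C++11 equivalent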
diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox
index 61cc50afa..ee4f53a4e 100644
--- a/doc/SparseLinearSystems.dox
+++ b/doc/SparseLinearSystems.dox
@@ -65,17 +65,17 @@ They are summarized in the following tables:
<td>Requires the <a href="http://pastix.gforge.inria.fr">PaStiX</a> package, \b CeCILL-C </td>
<td>optimized for tough problems and symmetric patterns</td></tr>
<tr><td>CholmodSupernodalLLT</td><td>\link CholmodSupport_Module CholmodSupport \endlink</td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing, Leverage fast dense algebra</td>
- <td>Requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td>
+ <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
<td></td></tr>
<tr><td>UmfPackLU</td><td>\link UmfPackSupport_Module UmfPackSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
- <td>Requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td>
+ <td>Requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td>
<td></td></tr>
<tr><td>SuperLU</td><td>\link SuperLUSupport_Module SuperLUSupport \endlink</td><td>Direct LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
<td>Requires the <a href="http://crd-legacy.lbl.gov/~xiaoye/SuperLU/">SuperLU</a> library, (BSD-like)</td>
<td></td></tr>
<tr><td>SPQR</td><td>\link SPQRSupport_Module SPQRSupport \endlink </td> <td> QR factorization </td>
<td> Any, rectangular</td><td>fill-in reducing, multithreaded, fast dense algebra</td>
- <td> requires the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</tr>
+ <td> requires the <a href="http://www.suitesparse.com">SuiteSparse</a> package, \b GPL </td><td>recommended for linear least-squares problems, has a rank-revealing feature</tr>
</table>
Here \c SPD means symmetric positive definite.
diff --git a/doc/TopicAliasing.dox b/doc/TopicAliasing.dox
index c2654aed2..a8f164428 100644
--- a/doc/TopicAliasing.dox
+++ b/doc/TopicAliasing.dox
@@ -153,10 +153,11 @@ not necessary to evaluate the right-hand side explicitly.
\section TopicAliasingMatrixMult Aliasing and matrix multiplication
-Matrix multiplication is the only operation in %Eigen that assumes aliasing by default. Thus, if \c matA is a
-matrix, then the statement <tt>matA = matA * matA;</tt> is safe. All other operations in %Eigen assume that
-there are no aliasing problems, either because the result is assigned to a different matrix or because it is a
-component-wise operation.
+Matrix multiplication is the only operation in %Eigen that assumes aliasing by default, <strong>under the
+condition that the destination matrix is not resized</strong>.
+Thus, if \c matA is a \b square matrix, then the statement <tt>matA = matA * matA;</tt> is safe.
+All other operations in %Eigen assume that there are no aliasing problems,
+either because the result is assigned to a different matrix or because it is a component-wise operation.
<table class="example">
<tr><th>Example</th><th>Output</th></tr>
@@ -198,6 +199,27 @@ may get wrong results:
\verbinclude TopicAliasing_mult3.out
</td></tr></table>
+Moreover, starting in Eigen 3.3, aliasing is \b not assumed if the destination matrix is resized and the product is not directly assigned to the destination.
+Therefore, the following example is also wrong:
+
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult4.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult4.out
+</td></tr></table>
+
+As with any aliasing issue, you can resolve it by explicitly evaluating the expression prior to assignment:
+<table class="example">
+<tr><th>Example</th><th>Output</th></tr>
+<tr><td>
+\include TopicAliasing_mult5.cpp
+</td>
+<td>
+\verbinclude TopicAliasing_mult5.out
+</td></tr></table>
\section TopicAliasingSummary Summary
diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox
index 6d25ff0ea..f5322b4a6 100644
--- a/doc/TutorialReductionsVisitorsBroadcasting.dox
+++ b/doc/TutorialReductionsVisitorsBroadcasting.dox
@@ -101,17 +101,16 @@ row and column position are to be stored. These variables should be of type
\verbinclude Tutorial_ReductionsVisitorsBroadcasting_visitors.out
</td></tr></table>
-Note that both functions also return the value of the minimum or maximum coefficient if needed,
-as if it was a typical reduction operation.
+Both functions also return the value of the minimum or maximum coefficient.
\section TutorialReductionsVisitorsBroadcastingPartialReductions Partial reductions
Partial reductions are reductions that can operate column- or row-wise on a Matrix or
Array, applying the reduction operation on each column or row and
-returning a column or row-vector with the corresponding values. Partial reductions are applied
+returning a column or row vector with the corresponding values. Partial reductions are applied
with \link DenseBase::colwise() colwise() \endlink or \link DenseBase::rowwise() rowwise() \endlink.
A simple example is obtaining the maximum of the elements
-in each column in a given matrix, storing the result in a row-vector:
+in each column in a given matrix, storing the result in a row vector:
<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
@@ -133,8 +132,7 @@ The same operation can be performed row-wise:
\verbinclude Tutorial_ReductionsVisitorsBroadcasting_rowwise.out
</td></tr></table>
-<b>Note that column-wise operations return a 'row-vector' while row-wise operations
-return a 'column-vector'</b>
+<b>Note that column-wise operations return a row vector, while row-wise operations return a column vector.</b>
\subsection TutorialReductionsVisitorsBroadcastingPartialReductionsCombined Combining partial reductions with other operations
It is also possible to use the result of a partial reduction to do further processing.
@@ -176,7 +174,7 @@ The concept behind broadcasting is similar to partial reductions, with the diffe
constructs an expression where a vector (column or row) is interpreted as a matrix by replicating it in
one direction.
-A simple example is to add a certain column-vector to each column in a matrix.
+A simple example is to add a certain column vector to each column in a matrix.
This can be accomplished with:
<table class="example">
@@ -253,7 +251,7 @@ is a new matrix whose size is the same as matrix <tt>m</tt>: \f[
\f]
- <tt>(m.colwise() - v).colwise().squaredNorm()</tt> is a partial reduction, computing the squared norm column-wise. The result of
-this operation is a row-vector where each coefficient is the squared Euclidean distance between each column in <tt>m</tt> and <tt>v</tt>: \f[
+this operation is a row vector where each coefficient is the squared Euclidean distance between each column in <tt>m</tt> and <tt>v</tt>: \f[
\mbox{(m.colwise() - v).colwise().squaredNorm()} =
\begin{bmatrix}
1 & 505 & 32 & 50
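That row vector of squared distances feeds naturally into a further reduction; a minimal sketch of the classic nearest-column idiom, assuming m and v as in the surrounding text:

\code
MatrixXf::Index index;
// squared distance from v to each column of m, then the argmin:
(m.colwise() - v).colwise().squaredNorm().minCoeff(&index);
std::cout << "Nearest column: " << m.col(index).transpose() << std::endl;
\endcode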
diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox
index fb07adaa2..1f0be387d 100644
--- a/doc/TutorialSparse.dox
+++ b/doc/TutorialSparse.dox
@@ -257,7 +257,14 @@ Binary coefficient wise operators can also mix sparse and dense expressions:
\code
sm2 = sm1.cwiseProduct(dm1);
dm2 = sm1 + dm1;
+dm2 = dm1 - sm1;
\endcode
+Performance-wise, adding/subtracting a sparse and a dense matrix is better performed in two steps. For instance, instead of <tt>dm2 = sm1 + dm1</tt>, better write:
+\code
+dm2 = dm1;
+dm2 += sm1;
+\endcode
+This version has the advantage of fully exploiting the higher performance of dense storage (no indirection, SIMD, etc.), and of paying the cost of the slow sparse evaluation only on the few non-zeros of the sparse matrix.
%Sparse expressions also support transposition:
diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox
index 84db992b6..02c62ad85 100644
--- a/doc/UsingIntelMKL.dox
+++ b/doc/UsingIntelMKL.dox
@@ -52,7 +52,7 @@ When doing so, a number of Eigen's algorithms are silently substituted with call
These substitutions apply only for \b Dynamic \b or \b large enough objects with one of the following four standard scalar types: \c float, \c double, \c complex<float>, and \c complex<double>.
Operations on other scalar types or mixing reals and complexes will continue to use the built-in algorithms.
-In addition you can coarsely select choose which parts will be substituted by defining one or multiple of the following macros:
+In addition, you can choose which parts will be substituted by defining one or more of the following macros:
<table class="manual">
<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (currently works with Intel MKL only)</td></tr>
diff --git a/doc/snippets/TopicAliasing_mult4.cpp b/doc/snippets/TopicAliasing_mult4.cpp
new file mode 100644
index 000000000..8a8992f6c
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult4.cpp
@@ -0,0 +1,5 @@
+MatrixXf A(2,2), B(3,2);
+B << 2, 0, 0, 3, 1, 1;
+A << 2, 0, 0, -2;
+A = (B * A).cwiseAbs();
+cout << A;
\ No newline at end of file
diff --git a/doc/snippets/TopicAliasing_mult5.cpp b/doc/snippets/TopicAliasing_mult5.cpp
new file mode 100644
index 000000000..1a36defde
--- /dev/null
+++ b/doc/snippets/TopicAliasing_mult5.cpp
@@ -0,0 +1,5 @@
+MatrixXf A(2,2), B(3,2);
+B << 2, 0, 0, 3, 1, 1;
+A << 2, 0, 0, -2;
+A = (B * A).eval().cwiseAbs();
+cout << A;
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index b1e69c2e5..9c895e0ac 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -45,12 +45,14 @@ template<> struct adjoint_specific<false> {
// check null inputs
VERIFY_IS_APPROX((v1*0).normalized(), (v1*0));
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE)
RealScalar very_small = (std::numeric_limits<RealScalar>::min)();
VERIFY( (v1*very_small).norm() == 0 );
VERIFY_IS_APPROX((v1*very_small).normalized(), (v1*very_small));
v3 = v1*very_small;
v3.normalize();
VERIFY_IS_APPROX(v3, (v1*very_small));
+#endif
// check compatibility of dot and adjoint
ref = NumTraits<Scalar>::IsInteger ? 0 : (std::max)((std::max)(v1.norm(),v2.norm()),(std::max)((square * v2).norm(),(square.adjoint() * v1).norm()));
diff --git a/test/array.cpp b/test/array.cpp
index 6adedfb06..96aef31c7 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -219,6 +219,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
#ifdef EIGEN_HAS_C99_MATH
VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1));
+ VERIFY_IS_APPROX(m1.digamma(), digamma(m1));
VERIFY_IS_APPROX(m1.erf(), erf(m1));
VERIFY_IS_APPROX(m1.erfc(), erfc(m1));
#endif // EIGEN_HAS_C99_MATH
@@ -309,7 +310,22 @@ template<typename ArrayType> void array_real(const ArrayType& m)
s1 += Scalar(tiny);
m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse());
-
+
+ // check special functions (comparing against numpy implementation)
+#ifdef EIGEN_HAS_C99_MATH
+ if (!NumTraits<Scalar>::IsComplex) {
+ VERIFY_IS_APPROX(numext::digamma(Scalar(1)), RealScalar(-0.5772156649015329));
+ VERIFY_IS_APPROX(numext::digamma(Scalar(1.5)), RealScalar(0.03648997397857645));
+ VERIFY_IS_APPROX(numext::digamma(Scalar(4)), RealScalar(1.2561176684318));
+ VERIFY_IS_APPROX(numext::digamma(Scalar(-10.5)), RealScalar(2.398239129535781));
+ VERIFY_IS_APPROX(numext::digamma(Scalar(10000.5)), RealScalar(9.210340372392849));
+ VERIFY_IS_EQUAL(numext::digamma(Scalar(0)),
+ std::numeric_limits<RealScalar>::infinity());
+ VERIFY_IS_EQUAL(numext::digamma(Scalar(-1)),
+ std::numeric_limits<RealScalar>::infinity());
+ }
+#endif // EIGEN_HAS_C99_MATH
+
// check inplace transpose
m3 = m1;
m3.transposeInPlace();
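The reference values in the digamma checks follow from standard identities, e.g. the recurrence together with the Euler–Mascheroni constant:

\f[ \psi(1) = -\gamma \approx -0.5772156649, \qquad \psi(x+1) = \psi(x) + \tfrac{1}{x} \;\Rightarrow\; \psi(4) = -\gamma + 1 + \tfrac{1}{2} + \tfrac{1}{3} \approx 1.2561176684, \f]

while the asymptotic form \f$ \psi(x) \approx \ln x - \tfrac{1}{2x} \f$ gives \f$ \psi(10000.5) \approx 9.21034037 \f$, and the poles of \f$ \psi \f$ at zero and the negative integers motivate the infinity checks.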
@@ -336,8 +352,6 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
Array<RealScalar, -1, -1> m3(rows, cols);
- Scalar s1 = internal::random<Scalar>();
-
for (Index i = 0; i < m.rows(); ++i)
for (Index j = 0; j < m.cols(); ++j)
m2(i,j) = sqrt(m1(i,j));
@@ -410,6 +424,7 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1);
// scalar by array division
+ Scalar s1 = internal::random<Scalar>();
const RealScalar tiny = sqrt(std::numeric_limits<RealScalar>::epsilon());
s1 += Scalar(tiny);
m1 += ArrayType::Constant(rows,cols,Scalar(tiny));
diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
index 9667e1f14..db5f3b34a 100644
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -68,6 +68,16 @@ template<typename MatrixType> void array_for_matrix(const MatrixType& m)
const Scalar& ref_a2 = m.array().matrix().coeffRef(0,0);
VERIFY(&ref_a1 == &ref_m1);
VERIFY(&ref_a2 == &ref_m2);
+
+ // Check write accessors:
+ m1.array().coeffRef(0,0) = 1;
+ VERIFY_IS_APPROX(m1(0,0),Scalar(1));
+ m1.array()(0,0) = 2;
+ VERIFY_IS_APPROX(m1(0,0),Scalar(2));
+ m1.array().matrix().coeffRef(0,0) = 3;
+ VERIFY_IS_APPROX(m1(0,0),Scalar(3));
+ m1.array().matrix()(0,0) = 4;
+ VERIFY_IS_APPROX(m1(0,0),Scalar(4));
}
template<typename MatrixType> void comparisons(const MatrixType& m)
diff --git a/test/diagonal.cpp b/test/diagonal.cpp
index 53814a588..ee00cad55 100644
--- a/test/diagonal.cpp
+++ b/test/diagonal.cpp
@@ -20,6 +20,8 @@ template<typename MatrixType> void diagonal(const MatrixType& m)
MatrixType m1 = MatrixType::Random(rows, cols),
m2 = MatrixType::Random(rows, cols);
+ Scalar s1 = internal::random<Scalar>();
+
//check diagonal()
VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal());
m2.diagonal() = 2 * m1.diagonal();
@@ -58,6 +60,11 @@ template<typename MatrixType> void diagonal(const MatrixType& m)
VERIFY_IS_APPROX(m2.template diagonal<N2>(), static_cast<Scalar>(2) * m1.diagonal(N2));
m2.diagonal(N2)[0] *= 3;
VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast<Scalar>(6) * m1.diagonal(N2)[0]);
+
+ m2.diagonal(N2).x() = s1;
+ VERIFY_IS_APPROX(m2.diagonal(N2).x(), s1);
+ m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1;
+ VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1);
}
}
diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp
index 7acad9872..59ffe9259 100644
--- a/test/incomplete_cholesky.cpp
+++ b/test/incomplete_cholesky.cpp
@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
-// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -34,4 +34,32 @@ void test_incomplete_cholesky()
CALL_SUBTEST_1(( test_incomplete_cholesky_T<double,int>() ));
CALL_SUBTEST_2(( test_incomplete_cholesky_T<std::complex<double>, int>() ));
CALL_SUBTEST_3(( test_incomplete_cholesky_T<double,long int>() ));
+
+#ifdef EIGEN_TEST_PART_1
+ // regression for bug 1150
+ for(int N = 1; N<20; ++N)
+ {
+ Eigen::MatrixXd b( N, N );
+ b.setOnes();
+
+ Eigen::SparseMatrix<double> m( N, N );
+ m.reserve(Eigen::VectorXi::Constant(N,4));
+ for( int i = 0; i < N; ++i )
+ {
+ m.insert( i, i ) = 1;
+ m.coeffRef( i, i / 2 ) = 2;
+ m.coeffRef( i, i / 3 ) = 2;
+ m.coeffRef( i, i / 4 ) = 2;
+ }
+
+ Eigen::SparseMatrix<double> A;
+ A = m * m.transpose();
+
+ Eigen::ConjugateGradient<Eigen::SparseMatrix<double>,
+ Eigen::Lower | Eigen::Upper,
+ Eigen::IncompleteCholesky<double> > solver( A );
+ VERIFY(solver.preconditioner().info() == Eigen::Success);
+ VERIFY(solver.info() == Eigen::Success);
+ }
+#endif
}
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index 32d9d0be9..a3b469af8 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -44,6 +44,7 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
Mat_d md = mf.template cast<double>();
Mat_cf mcf = Mat_cf::Random(size,size);
Mat_cd mcd = mcf.template cast<complex<double> >();
+ Mat_cd rcd = mcd;
Vec_f vf = Vec_f::Random(size,1);
Vec_d vd = vf.template cast<double>();
Vec_cf vcf = Vec_cf::Random(size,1);
@@ -103,24 +104,23 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
VERIFY_IS_APPROX(mcd.array() *= md.array(), mcd2.array() *= md.array().template cast<std::complex<double> >());
// check matrix-matrix products
-
VERIFY_IS_APPROX(sd*md*mcd, (sd*md).template cast<CD>().eval()*mcd);
VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast<CD>());
VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast<CD>().eval()*mcd);
VERIFY_IS_APPROX(scd*mcd*md, scd*mcd*md.template cast<CD>());
-
+
VERIFY_IS_APPROX(sf*mf*mcf, sf*mf.template cast<CF>()*mcf);
VERIFY_IS_APPROX(sf*mcf*mf, sf*mcf*mf.template cast<CF>());
VERIFY_IS_APPROX(scf*mf*mcf, scf*mf.template cast<CF>()*mcf);
VERIFY_IS_APPROX(scf*mcf*mf, scf*mcf*mf.template cast<CF>());
-
+
VERIFY_IS_APPROX(sd*md.adjoint()*mcd, (sd*md).template cast<CD>().eval().adjoint()*mcd);
VERIFY_IS_APPROX(sd*mcd.adjoint()*md, sd*mcd.adjoint()*md.template cast<CD>());
VERIFY_IS_APPROX(sd*md.adjoint()*mcd.adjoint(), (sd*md).template cast<CD>().eval().adjoint()*mcd.adjoint());
VERIFY_IS_APPROX(sd*mcd.adjoint()*md.adjoint(), sd*mcd.adjoint()*md.template cast<CD>().adjoint());
VERIFY_IS_APPROX(sd*md*mcd.adjoint(), (sd*md).template cast<CD>().eval()*mcd.adjoint());
VERIFY_IS_APPROX(sd*mcd*md.adjoint(), sd*mcd*md.template cast<CD>().adjoint());
-
+
VERIFY_IS_APPROX(sf*mf.adjoint()*mcf, (sf*mf).template cast<CF>().eval().adjoint()*mcf);
VERIFY_IS_APPROX(sf*mcf.adjoint()*mf, sf*mcf.adjoint()*mf.template cast<CF>());
VERIFY_IS_APPROX(sf*mf.adjoint()*mcf.adjoint(), (sf*mf).template cast<CF>().eval().adjoint()*mcf.adjoint());
@@ -147,6 +147,39 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
VERIFY_IS_APPROX(scd*vcd.adjoint()*md, scd*vcd.adjoint()*md.template cast<CD>().eval());
VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast<CD>().eval()*mcd);
VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd);
+
+ VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
+ VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>());
+ VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
+ VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>());
+
+ // Not supported yet: trmm
+// VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(), sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>());
+// VERIFY_IS_APPROX(scd*mcd*md.template triangularView<Upper>(), scd*mcd*md.template cast<CD>().eval().template triangularView<Upper>());
+// VERIFY_IS_APPROX(sd*md*mcd.template triangularView<Lower>(), sd*md.template cast<CD>().eval()*mcd.template triangularView<Lower>());
+// VERIFY_IS_APPROX(scd*md*mcd.template triangularView<Upper>(), scd*md.template cast<CD>().eval()*mcd.template triangularView<Upper>());
+
+ // Not supported yet: symv
+// VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+// VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Lower>());
+// VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Lower>());
+// VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+
+ // Not supported yet: symm
+// VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template selfadjointView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+// VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template selfadjointView<Upper>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template selfadjointView<Upper>());
+// VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template selfadjointView<Upper>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+// VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template selfadjointView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template selfadjointView<Upper>());
+
+ rcd.setZero();
+ VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = sd * mcd * md),
+ Mat_cd((sd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
+ VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = sd * md * mcd),
+ Mat_cd((sd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
+ VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * mcd * md),
+ Mat_cd((scd * mcd * md.template cast<CD>().eval()).template triangularView<Upper>()));
+ VERIFY_IS_APPROX(Mat_cd(rcd.template triangularView<Upper>() = scd * md * mcd),
+ Mat_cd((scd * md.template cast<CD>().eval() * mcd).template triangularView<Upper>()));
}
void test_mixingtypes()
diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp
index 060276a20..50756c2fb 100644
--- a/test/nomalloc.cpp
+++ b/test/nomalloc.cpp
@@ -78,14 +78,15 @@ template<typename MatrixType> void nomalloc(const MatrixType& m)
VERIFY_IS_APPROX(m2,m2);
m2.template selfadjointView<Lower>().rankUpdate(m1.col(0),-1);
- m2.template selfadjointView<Lower>().rankUpdate(m1.row(0),-1);
+ m2.template selfadjointView<Upper>().rankUpdate(m1.row(0),-1);
+ m2.template selfadjointView<Lower>().rankUpdate(m1.col(0), m1.col(0)); // rank-2
// The following fancy matrix-matrix products are not safe yet regarding static allocation
-// m1 += m1.template triangularView<Upper>() * m2.col(;
-// m1.template selfadjointView<Lower>().rankUpdate(m2);
-// m1 += m1.template triangularView<Upper>() * m2;
-// m1 += m1.template selfadjointView<Lower>() * m2;
-// VERIFY_IS_APPROX(m1,m1);
+ m2.template selfadjointView<Lower>().rankUpdate(m1);
+ m2 += m2.template triangularView<Upper>() * m1;
+ m2.template triangularView<Upper>() = m2 * m2;
+ m1 += m1.template selfadjointView<Lower>() * m2;
+ VERIFY_IS_APPROX(m2,m2);
}
template<typename Scalar>
diff --git a/test/nullary.cpp b/test/nullary.cpp
index 4844f2952..cb87695ee 100644
--- a/test/nullary.cpp
+++ b/test/nullary.cpp
@@ -48,30 +48,32 @@ void testVectorType(const VectorType& base)
VectorType m(base);
m.setLinSpaced(size,low,high);
+ if(!NumTraits<Scalar>::IsInteger)
+ {
+ VectorType n(size);
+ for (int i=0; i<size; ++i)
+ n(i) = low+i*step;
+ VERIFY_IS_APPROX(m,n);
+ }
+
VectorType n(size);
for (int i=0; i<size; ++i)
- n(i) = low+i*step;
-
+ n(i) = size==1 ? low : (low + ((high-low)*Scalar(i))/(size-1));
VERIFY_IS_APPROX(m,n);
// random access version
m = VectorType::LinSpaced(size,low,high);
VERIFY_IS_APPROX(m,n);
- // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
- VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<Scalar>::epsilon() );
-
- // These guys sometimes fail! This is not good. Any ideas how to fix them!?
- //VERIFY( m(m.size()-1) == high );
- //VERIFY( m(0) == low );
+ VERIFY( internal::isApprox(m(m.size()-1),high) );
+ VERIFY( size==1 || internal::isApprox(m(0),low) );
// sequential access version
m = VectorType::LinSpaced(Sequential,size,low,high);
VERIFY_IS_APPROX(m,n);
- // These guys sometimes fail! This is not good. Any ideas how to fix them!?
- //VERIFY( m(m.size()-1) == high );
- //VERIFY( m(0) == low );
+ VERIFY( internal::isApprox(m(m.size()-1),high) );
+ VERIFY( size==1 || internal::isApprox(m(0),low) );
// check whether everything works with row and col major vectors
Matrix<Scalar,Dynamic,1> row_vector(size);
@@ -126,5 +128,13 @@ void test_nullary()
CALL_SUBTEST_8( testVectorType(Vector4f()) );
CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
+
+ CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,300))) );
+ CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
}
+
+#ifdef EIGEN_TEST_PART_6
+ // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
+ VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<double>::epsilon() );
+#endif
}
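The tightened endpoint checks rely on LinSpaced evaluating to low + i*(high-low)/(size-1), which is exact at i=0 and i=size-1; a minimal standalone sketch:

    #include <Eigen/Dense>
    #include <iostream>
    int main() {
      Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0);
      std::cout << v.transpose() << std::endl;  // 0 0.25 0.5 0.75 1 -- first==low, last==high
    }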
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index d803e7dae..0a06c828b 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -192,6 +192,11 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
VERIFY_IS_APPROX(refM4.cwiseProduct(m3), refM4.cwiseProduct(refM3));
// VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4);
+ VERIFY_IS_APPROX(refM4 + m3, refM4 + refM3);
+ VERIFY_IS_APPROX(m3 + refM4, refM3 + refM4);
+ VERIFY_IS_APPROX(refM4 - m3, refM4 - refM3);
+ VERIFY_IS_APPROX(m3 - refM4, refM3 - refM4);
+
// test aliasing
VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1));
VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval()));
@@ -455,6 +460,33 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
refMat1.setIdentity();
VERIFY_IS_APPROX(m1, refMat1);
}
+
+ // test array/vector of InnerIterator
+ {
+ typedef typename SparseMatrixType::InnerIterator IteratorType;
+
+ DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
+ SparseMatrixType m2(rows, cols);
+ initSparse<Scalar>(density, refMat2, m2);
+ IteratorType static_array[2];
+ static_array[0] = IteratorType(m2,0);
+ static_array[1] = IteratorType(m2,m2.outerSize()-1);
+ VERIFY( static_array[0] || m2.innerVector(static_array[0].outer()).nonZeros() == 0 );
+ VERIFY( static_array[1] || m2.innerVector(static_array[1].outer()).nonZeros() == 0 );
+ if(static_array[0] && static_array[1])
+ {
+ ++(static_array[1]);
+ static_array[1] = IteratorType(m2,0);
+ VERIFY( static_array[1] );
+ VERIFY( static_array[1].index() == static_array[0].index() );
+ VERIFY( static_array[1].outer() == static_array[0].outer() );
+ VERIFY( static_array[1].value() == static_array[0].value() );
+ }
+
+ std::vector<IteratorType> iters(2);
+ iters[0] = IteratorType(m2,0);
+ iters[1] = IteratorType(m2,m2.outerSize()-1);
+ }
}
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index d3975b99b..d95f301d5 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -9,14 +9,14 @@
#include "sparse.h"
-template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
+template<typename Scalar,typename StorageIndex> void sparse_vector(int rows, int cols)
{
double densityMat = (std::max)(8./(rows*cols), 0.01);
double densityVec = (std::max)(8./float(rows), 0.1);
typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
typedef Matrix<Scalar,Dynamic,1> DenseVector;
- typedef SparseVector<Scalar,0,Index> SparseVectorType;
- typedef SparseMatrix<Scalar,0,Index> SparseMatrixType;
+ typedef SparseVector<Scalar,0,StorageIndex> SparseVectorType;
+ typedef SparseMatrix<Scalar,0,StorageIndex> SparseMatrixType;
Scalar eps = 1e-6;
SparseMatrixType m1(rows,rows);
@@ -87,8 +87,10 @@ template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
VERIFY_IS_APPROX(m1*v2, refM1*refV2);
VERIFY_IS_APPROX(v1.dot(m1*v2), refV1.dot(refM1*refV2));
- int i = internal::random<int>(0,rows-1);
- VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i)));
+ {
+ int i = internal::random<int>(0,rows-1);
+ VERIFY_IS_APPROX(v1.dot(m1.col(i)), refV1.dot(refM1.col(i)));
+ }
VERIFY_IS_APPROX(v1.squaredNorm(), refV1.squaredNorm());
@@ -111,15 +113,51 @@ template<typename Scalar,typename Index> void sparse_vector(int rows, int cols)
VERIFY_IS_APPROX(refV3 = v1.transpose(),v1.toDense());
VERIFY_IS_APPROX(DenseVector(v1),v1.toDense());
+ // test conservative resize
+ {
+ std::vector<StorageIndex> inc;
+ if(rows > 3)
+ inc.push_back(-3);
+ inc.push_back(0);
+ inc.push_back(3);
+ inc.push_back(1);
+ inc.push_back(10);
+
+ for(std::size_t i = 0; i< inc.size(); i++) {
+ StorageIndex incRows = inc[i];
+ SparseVectorType vec1(rows);
+ DenseVector refVec1 = DenseVector::Zero(rows);
+ initSparse<Scalar>(densityVec, refVec1, vec1);
+
+ vec1.conservativeResize(rows+incRows);
+ refVec1.conservativeResize(rows+incRows);
+ if (incRows > 0) refVec1.tail(incRows).setZero();
+
+ VERIFY_IS_APPROX(vec1, refVec1);
+
+ // Insert new values
+ if (incRows > 0)
+ vec1.insert(vec1.rows()-1) = refVec1(refVec1.rows()-1) = 1;
+
+ VERIFY_IS_APPROX(vec1, refVec1);
+ }
+ }
+
}
void test_sparse_vector()
{
for(int i = 0; i < g_repeat; i++) {
+ int r = Eigen::internal::random<int>(1,500), c = Eigen::internal::random<int>(1,500);
+ if(Eigen::internal::random<int>(0,4) == 0) {
+ r = c; // check square matrices in ~20% of tries
+ }
+ EIGEN_UNUSED_VARIABLE(r+c);
+
CALL_SUBTEST_1(( sparse_vector<double,int>(8, 8) ));
- CALL_SUBTEST_2(( sparse_vector<std::complex<double>, int>(16, 16) ));
- CALL_SUBTEST_1(( sparse_vector<double,long int>(299, 535) ));
- CALL_SUBTEST_1(( sparse_vector<double,short>(299, 535) ));
+ CALL_SUBTEST_2(( sparse_vector<std::complex<double>, int>(r, c) ));
+ CALL_SUBTEST_1(( sparse_vector<double,long int>(r, c) ));
+ CALL_SUBTEST_1(( sparse_vector<double,short>(r, c) ));
}
}
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index 7561ae8be..c3eb5ff31 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -163,6 +163,21 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
VERIFY(!(numext::isfinite)(v.blueNorm())); VERIFY((numext::isnan)(v.blueNorm()));
VERIFY(!(numext::isfinite)(v.hypotNorm())); VERIFY((numext::isnan)(v.hypotNorm()));
}
+
+ // stableNormalize[d]
+ {
+ VERIFY_IS_APPROX(vrand.stableNormalized(), vrand.normalized());
+ MatrixType vcopy(vrand);
+ vcopy.stableNormalize();
+ VERIFY_IS_APPROX(vcopy, vrand.normalized());
+ VERIFY_IS_APPROX((vrand.stableNormalized()).norm(), RealScalar(1));
+ VERIFY_IS_APPROX(vcopy.norm(), RealScalar(1));
+ VERIFY_IS_APPROX((vbig.stableNormalized()).norm(), RealScalar(1));
+ VERIFY_IS_APPROX((vsmall.stableNormalized()).norm(), RealScalar(1));
+ RealScalar big_scaling = ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
+ VERIFY_IS_APPROX(vbig/big_scaling, (vbig.stableNorm() * vbig.stableNormalized()).eval()/big_scaling);
+ VERIFY_IS_APPROX(vsmall, vsmall.stableNorm() * vsmall.stableNormalized());
+ }
}
void test_stable_norm()
diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp
index 87476f95b..3cc198772 100644
--- a/test/vectorwiseop.cpp
+++ b/test/vectorwiseop.cpp
@@ -210,6 +210,9 @@ template<typename MatrixType> void vectorwiseop_matrix(const MatrixType& m)
VERIFY_IS_APPROX(m1.cwiseAbs().colwise().maxCoeff(), m1.colwise().template lpNorm<Infinity>());
VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().maxCoeff(), m1.rowwise().template lpNorm<Infinity>());
+ // regression for bug 1158
+ VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum().x(), m1.col(0).cwiseAbs().sum());
+
// test normalized
m2 = m1.colwise().normalized();
VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized());
diff --git a/test/zerosized.cpp b/test/zerosized.cpp
index da7dd0481..85c553453 100644
--- a/test/zerosized.cpp
+++ b/test/zerosized.cpp
@@ -25,6 +25,7 @@ template<typename MatrixType> void zeroReduction(const MatrixType& m) {
template<typename MatrixType> void zeroSizedMatrix()
{
MatrixType t1;
+ typedef typename MatrixType::Scalar Scalar;
if (MatrixType::SizeAtCompileTime == Dynamic || MatrixType::SizeAtCompileTime == 0)
{
@@ -45,6 +46,23 @@ template<typename MatrixType> void zeroSizedMatrix()
VERIFY(t1==t2);
}
}
+
+ if(MatrixType::MaxColsAtCompileTime!=0 && MatrixType::MaxRowsAtCompileTime!=0)
+ {
+ Index rows = MatrixType::RowsAtCompileTime==Dynamic ? internal::random<Index>(1,10) : MatrixType::RowsAtCompileTime;
+ Index cols = MatrixType::ColsAtCompileTime==Dynamic ? internal::random<Index>(1,10) : MatrixType::ColsAtCompileTime;
+ MatrixType m(rows,cols);
+ zeroReduction(m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols));
+ zeroReduction(m.template block<MatrixType::RowsAtCompileTime,0>(0,0,rows,0));
+ zeroReduction(m.template block<0,1>(0,0));
+ zeroReduction(m.template block<1,0>(0,0));
+ Matrix<Scalar,Dynamic,Dynamic> prod = m.template block<MatrixType::RowsAtCompileTime,0>(0,0,rows,0) * m.template block<0,MatrixType::ColsAtCompileTime>(0,0,0,cols);
+ VERIFY(prod.rows()==rows && prod.cols()==cols);
+ VERIFY(prod.isZero());
+ prod = m.template block<1,0>(0,0) * m.template block<0,1>(0,0);
+ VERIFY(prod.size()==1);
+ VERIFY(prod.isZero());
+ }
}
template<typename VectorType> void zeroSizedVector()
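The isZero() expectations encode a simple convention: a matrix product over a zero-length inner dimension is an empty sum, so every coefficient of the result is exactly zero. A minimal standalone sketch:

    #include <Eigen/Dense>
    #include <cassert>
    int main() {
      Eigen::MatrixXd a(3,0), b(0,4);
      Eigen::MatrixXd p = a * b;  // 3x4 result; each coefficient is an empty sum, hence 0
      assert(p.isZero() && p.rows() == 3 && p.cols() == 4);
    }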
diff --git a/unsupported/Eigen/AlignedVector3 b/unsupported/Eigen/AlignedVector3
index f5c40a189..135eec572 100644
--- a/unsupported/Eigen/AlignedVector3
+++ b/unsupported/Eigen/AlignedVector3
@@ -188,7 +188,7 @@ template<typename _Scalar> class AlignedVector3
}
template<typename Derived>
- inline bool isApprox(const MatrixBase<Derived>& other, RealScalar eps=NumTraits<Scalar>::dummy_precision()) const
+ inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const
{
return m_coeffs.template head<3>().isApprox(other,eps);
}
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
index 456b34d0b..89aeb03e7 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
@@ -25,6 +25,16 @@ template <typename T, size_t n> class array {
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& front() { return values[0]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
static std::size_t size() { return n; }
@@ -123,13 +133,33 @@ template <typename T> class array<T, 0> {
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& operator[] (size_t) {
eigen_assert(false && "Can't index a zero size array");
- return *static_cast<T*>(NULL);
+ return dummy;
}
-
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
eigen_assert(false && "Can't index a zero size array");
- return *static_cast<const T*>(NULL);
+ return dummy;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& front() {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& front() const {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& back() {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& back() const {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
@@ -142,6 +172,9 @@ template <typename T> class array<T, 0> {
eigen_assert(l.size() == 0);
}
#endif
+
+ private:
+ T dummy;
};
namespace internal {
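Annotation: returning a reference to a dummy member keeps the zero-size accessors well-formed; the old `*static_cast<T*>(NULL)` was undefined behavior even on a never-taken path. A minimal sketch of the pattern, with hypothetical names:

#include <cassert>
#include <cstddef>

template <typename T> class zero_size_array {
 public:
  // All accessors assert, but they return a reference to a real object
  // instead of dereferencing a null pointer (undefined behavior).
  T& operator[](std::size_t) { assert(false && "Can't index a zero size array"); return dummy; }
  T& front() { assert(false && "Can't index a zero size array"); return dummy; }
  T& back()  { assert(false && "Can't index a zero size array"); return dummy; }
  static std::size_t size() { return 0; }
 private:
  T dummy;  // never meaningfully read or written
};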
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 392acf302..cca716d6f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -129,6 +129,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived>
+ digamma() const {
+ return unaryExpr(internal::scalar_digamma_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived>
erf() const {
return unaryExpr(internal::scalar_erf_op<Scalar>());
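Annotation: a usage sketch for the new coefficient-wise digamma() (illustrative only; setRandom() draws floats in [-1,1], so the +2 shift keeps arguments in [1,3], away from digamma's poles at non-positive integers):

#include <Eigen/CXX11/Tensor>

void digamma_example() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();
  // psi(x) applied per coefficient, like the existing lgamma()/erf() ops.
  Eigen::Tensor<float, 2> d = (t + t.constant(2.0f)).digamma();
  (void)d;
}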
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 624e814e2..1adb68894 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -378,7 +378,7 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalGemv(Scalar* buffer) const {
+ EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size;
const Index cols = m_k_size;
@@ -516,7 +516,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Base(op, device) { }
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalProduct(Scalar* buffer) const {
+ EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
if (this->m_j_size == 1) {
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
return;
@@ -582,10 +582,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
OutputMapper output(buffer, m);
- typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType;
-
// Sizes of the blocks to load in cache. See the Goto paper for details.
- BlockingType blocking(m, n, k, 1, true);
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
const Index kc = blocking.kc();
const Index mc = numext::mini(m, blocking.mc());
const Index nc = numext::mini(n, blocking.nc());
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 78ed5038f..3d3f6904f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -28,7 +28,7 @@ class TensorContractionBlocking {
typedef typename LhsMapper::Scalar LhsScalar;
typedef typename RhsMapper::Scalar RhsScalar;
- TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+ EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {
@@ -41,9 +41,9 @@ class TensorContractionBlocking {
}
}
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
private:
Index kc_;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 9b6d18090..63c8ae126 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -426,15 +426,16 @@ class TensorContractionSubMapper {
};
-template<typename Scalar, typename Index, int side,
+template<typename Scalar_, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper
- : public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
+ : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
public:
+ typedef Scalar_ Scalar;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 576bea295..51a3b9490 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -176,10 +176,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// compute block sizes (which depend on number of threads)
const Index num_threads = this->m_device.numThreads();
- Index mc = m;
- Index nc = n;
- Index kc = k;
- internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads);
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
+ Index mc = blocking.mc();
+ Index nc = blocking.nc();
+ Index kc = blocking.kc();
eigen_assert(mc <= m);
eigen_assert(nc <= n);
eigen_assert(kc <= k);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 367a152a0..67c797802 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -21,7 +21,7 @@ namespace Eigen {
*/
namespace internal {
-template <typename Index, typename InputDims, size_t NumKernelDims, int Layout>
+template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
public:
IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
@@ -123,7 +123,7 @@ class IndexMapper {
}
inputIndex += p * m_inputStrides[NumKernelDims];
} else {
- int limit = 0;
+ std::ptrdiff_t limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}
@@ -147,7 +147,7 @@ class IndexMapper {
}
outputIndex += p * m_outputStrides[NumKernelDims];
} else {
- int limit = 0;
+ std::ptrdiff_t limit = 0;
if (NumKernelDims < NumDims) {
limit = NumDims - NumKernelDims - 1;
}
@@ -206,7 +206,7 @@ class IndexMapper {
}
private:
- static const size_t NumDims = internal::array_size<InputDims>::value;
+ static const int NumDims = internal::array_size<InputDims>::value;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_cudaInputStrides;
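Annotation: the switch from size_t/int to int/std::ptrdiff_t here is presumably about mixed signed/unsigned arithmetic: subtracting unsigned quantities wraps around instead of going negative, and comparing them against signed loop indices draws warnings. A standalone illustration of the wrap hazard:

#include <cstddef>
#include <iostream>

int main() {
  std::size_t NumDims = 2, NumKernelDims = 2;
  // Unsigned: 2 - 2 - 1 wraps to SIZE_MAX instead of -1.
  std::size_t wrapped = NumDims - NumKernelDims - 1;
  // Signed: behaves as expected.
  std::ptrdiff_t limit = static_cast<std::ptrdiff_t>(NumDims) -
                         static_cast<std::ptrdiff_t>(NumKernelDims) - 1;
  std::cout << wrapped << " vs " << limit << "\n";  // huge value vs -1
  return 0;
}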
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 5abdc489b..e684ab8f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -109,10 +109,12 @@ class CudaStreamDevice : public StreamInterface {
struct GpuDevice {
// The StreamInterface is not owned: the caller is
// responsible for its initialization and eventual destruction.
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+ eigen_assert(stream);
+ }
+ explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
eigen_assert(stream);
}
-
// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
return stream_->stream();
@@ -246,6 +248,10 @@ struct GpuDevice {
#endif
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+ return max_blocks_;
+ }
+
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
@@ -259,7 +265,7 @@ struct GpuDevice {
private:
const StreamInterface* stream_;
-
+ int max_blocks_;
};
#ifndef __CUDA_ARCH__
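Annotation: a usage sketch for the new two-argument constructor, reusing the Eigen::CudaStreamDevice that the updated tests below switch to (the cap of 1024 blocks is an arbitrary example; compile with nvcc):

#include <Eigen/CXX11/Tensor>
#include <cassert>

void capped_device_example() {
  Eigen::CudaStreamDevice stream;                  // default CUDA stream
  Eigen::GpuDevice device(&stream, /*num_blocks=*/1024);
  assert(device.maxBlocks() == 1024);
}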
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index e7daf7304..bd83d5de8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
}
template<int LoadMode>
- EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return internal::ploadt<Packet, LoadMode>(m_buffer + index);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index d2ab70f2b..df15c6204 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
if (needs_assign)
{
const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash if we're called with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
if (needs_assign)
{
const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash if we're called with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
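Annotation: the launch-geometry arithmetic above, restated as a standalone helper: clamp to at most max_blocks, but always launch at least one block so size-0 tensors don't crash.

#include <algorithm>
#include <cassert>

int num_blocks_for(long long size, int block_size, int max_blocks) {
  const int wanted = static_cast<int>((size + block_size - 1) / block_size);  // ceil(size / block_size)
  return std::max(std::min(max_blocks, wanted), 1);
}

int main() {
  assert(num_blocks_for(0, 1024, 65535) == 1);        // empty tensor still gets one block
  assert(num_blocks_for(4096, 1024, 65535) == 4);
  assert(num_blocks_for(1 << 30, 1024, 120) == 120);  // capped by max_blocks
  return 0;
}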
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c9b0b2f28..58b864787 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -106,7 +106,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
- m_impl.evalSubExprsIfNeeded(NULL);
const Index numValues = m_impl.dimensions().TotalSize();
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non-POD types.
@@ -119,7 +118,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EvalTo evalToTmp(m_buffer, m_op);
const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
- m_impl.cleanup();
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 09ee0c2c6..22aea5ea4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -345,8 +345,8 @@ template <typename Self, typename Op, typename Device>
struct InnerReducer {
static const bool HasOptimizedImplementation = false;
- static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
- assert(false && "Not implemented");
+ EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ eigen_assert(false && "Not implemented");
}
};
@@ -355,8 +355,8 @@ template <typename Self, typename Op, typename Device>
struct OuterReducer {
static const bool HasOptimizedImplementation = false;
- static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
- assert(false && "Not implemented");
+ EIGEN_DEVICE_FUNC static void run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ eigen_assert(false && "Not implemented");
}
};
@@ -463,7 +463,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
}
} else {
- m_outputStrides[NumOutputDims - 1] = 1;
+ m_outputStrides.back() = 1;
for (int i = NumOutputDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
}
@@ -479,7 +479,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
input_strides[i] = input_strides[i-1] * input_dims[i-1];
}
} else {
- input_strides[NumInputDims - 1] = 1;
+ input_strides.back() = 1;
for (int i = NumInputDims - 2; i >= 0; --i) {
input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
}
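Annotation: using .back() instead of [NumOutputDims - 1] / [NumInputDims - 1] presumably avoids forming the index -1 when the output is rank 0 (a full reduction), and it is what the new back() accessor on the zero-size array specialization above exists to support: that specialization has no values member at all (zero-length C arrays are not valid C++). Sketch of the boundary case, with hypothetical names:

#include <cstddef>

template <typename T, std::size_t N> struct small_array {
  T values[N];
  T& back() { return values[N - 1]; }
};
template <typename T> struct small_array<T, 0> {
  T& back() { return dummy; }  // callable, never meaningfully used
 private:
  T dummy;
};

small_array<int, 3> a3;  // a3.back() and a3.values[2] are both fine
small_array<int, 0> a0;  // a0.back() compiles; there is no a0.values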
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 98631fc7f..ed933b6ac 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -41,7 +41,10 @@ class TensorStorage<T, FixedDimensions, Options_>
private:
static const std::size_t Size = FixedDimensions::total_size;
- EIGEN_ALIGN_MAX T m_data[Size];
+ // Allocate an array of size at least one to prevent compiler warnings.
+ static const std::size_t MinSize = max_n_1<Size>::size;
+ EIGEN_ALIGN_MAX T m_data[MinSize];
+
FixedDimensions m_dimensions;
public:
@@ -105,7 +108,6 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions)
{
- eigen_assert(size >= 1);
const Index currentSz = internal::array_prod(m_dimensions);
if(size != currentSz)
{
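Annotation: max_n_1 clamps the compile-time length to at least one, so a zero-element fixed-size tensor still gets a valid one-element backing array instead of a zero-length array. Roughly (a sketch of the helper's shape; the real one lives in Eigen's tensor headers):

#include <cstddef>

template <std::size_t n> struct max_n_1    { static const std::size_t size = n; };
template <>              struct max_n_1<0> { static const std::size_t size = 1; };

static_assert(max_n_1<0>::size == 1, "clamped to one");
static_assert(max_n_1<5>::size == 5, "unchanged otherwise");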
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 19352eb5e..0d34f7ee6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -34,11 +34,11 @@ struct TensorUInt128
LOW low;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- TensorUInt128(int x) : high(0), low(x) {
+ TensorUInt128(int32_t x) : high(0), low(x) {
eigen_assert(x >= 0);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- TensorUInt128(unsigned int x) : high(0), low(x) { }
+ TensorUInt128(uint32_t x) : high(0), low(x) { }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
TensorUInt128(long x) : high(0), low(x) {
eigen_assert(x >= 0);
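Annotation: taking int32_t/uint32_t instead of int/unsigned int presumably pins the constructor set to fixed widths regardless of the platform's data model, since the class also has long overloads and the built-in types' widths vary across LP64/LLP64. A hypothetical illustration:

#include <cstdint>

struct u128ish {
  explicit u128ish(int32_t)  {}
  explicit u128ish(uint32_t) {}
  explicit u128ish(int64_t)  {}
  explicit u128ish(uint64_t) {}
  // With fixed-width parameters, each width has exactly one signed and one
  // unsigned overload on every platform; with int/long the set can collapse
  // or leave gaps depending on the data model.
};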
diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h
index d3c245fa9..8e6a5aaed 100644
--- a/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -167,7 +167,7 @@ namespace Eigen
derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(),
temporaryKnots.data());
- // Number of control points (one for each point and derivative) plus spline order.
+ // Number of knots (one for each point and derivative) plus spline order.
DenseIndex numKnots = numParameters + numDerivatives + degree + 1;
knots.resize(numKnots);
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 97257b183..eed724bcf 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -147,13 +147,27 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_sugar "-std=c++0x")
ei_add_test(cxx11_tensor_fft "-std=c++0x")
ei_add_test(cxx11_tensor_ifft "-std=c++0x")
+ ei_add_test(cxx11_tensor_empty "-std=c++0x")
- # These tests needs nvcc
-# ei_add_test(cxx11_tensor_device "-std=c++0x")
-# ei_add_test(cxx11_tensor_cuda "-std=c++0x")
-# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x")
-# ei_add_test(cxx11_tensor_reduction_cuda "-std=c++0x")
-# ei_add_test(cxx11_tensor_random_cuda "-std=c++0x")
-# ei_add_test(cxx11_tensor_argmax_cuda "-std=c++0x")
+endif()
+# These tests need nvcc
+find_package(CUDA 7.0)
+if(CUDA_FOUND)
+ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+ set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
+ endif()
+ set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_30")
+ cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+
+ ei_add_test(cxx11_tensor_device)
+ ei_add_test(cxx11_tensor_cuda)
+ ei_add_test(cxx11_tensor_contract_cuda)
+ ei_add_test(cxx11_tensor_reduction_cuda)
+ ei_add_test(cxx11_tensor_random_cuda)
+ ei_add_test(cxx11_tensor_argmax_cuda)
+
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cpp b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index d37490d15..45311d4f7 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -56,6 +56,10 @@ void test_cuda_simple_argmax()
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+
+ cudaFree(d_in);
+ cudaFree(d_out_max);
+ cudaFree(d_out_min);
}
template <int DataLayout>
@@ -141,6 +145,9 @@ void test_cuda_argmax_dim()
// Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
}
@@ -227,15 +234,18 @@ void test_cuda_argmin_dim()
// Expect min to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
}
void test_cxx11_tensor_cuda()
{
- CALL_SUBTEST(test_cuda_simple_argmax<RowMajor>());
- CALL_SUBTEST(test_cuda_simple_argmax<ColMajor>());
- CALL_SUBTEST(test_cuda_argmax_dim<RowMajor>());
- CALL_SUBTEST(test_cuda_argmax_dim<ColMajor>());
- CALL_SUBTEST(test_cuda_argmin_dim<RowMajor>());
- CALL_SUBTEST(test_cuda_argmin_dim<ColMajor>());
+ CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
+ CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
+ CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
+ CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
+ CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
}
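Annotation: the switch from plain CALL_SUBTEST to numbered CALL_SUBTEST_n lets CMake split a long-running test into separately built and run parts. Simplified sketch of the mechanism in test/main.h: each split binary is compiled with one EIGEN_TEST_PART_n defined, and only the matching subtests expand to anything.

// Simplified from Eigen's test/main.h (one such block per supported n):
#ifdef EIGEN_TEST_PART_1
#define CALL_SUBTEST_1(FUNC) CALL_SUBTEST(FUNC)
#else
#define CALL_SUBTEST_1(FUNC)
#endif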
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 035a093e6..6d1ef07f9 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -22,16 +22,16 @@ using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
-static void test_cuda_contraction(int m_size, int k_size, int n_size)
+void test_cuda_contraction(int m_size, int k_size, int n_size)
{
- cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl;
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
- Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size));
- Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size));
- Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size));
- Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size));
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+ Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
t_left.setRandom();
@@ -67,12 +67,16 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size)
t_result = t_left.contract(t_right, dims);
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
- for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
- if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) {
- cout << "mismatch detected at index " << i << ": " << t_result.data()[i]
- << " vs " << t_result_gpu.data()[i] << endl;
- assert(false);
+ for (size_t i = 0; i < t_result.size(); i++) {
+ if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+ continue;
}
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ assert(false);
}
cudaFree((void*)d_t_left);
@@ -80,41 +84,69 @@ static void test_cuda_contraction(int m_size, int k_size, int n_size)
cudaFree((void*)d_t_result);
}
-
-void test_cxx11_tensor_cuda()
-{
- cout<<"Calling contraction tests"<<std::endl;
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128));
+template<int DataLayout>
+void test_cuda_contraction_m() {
for (int k = 32; k < 256; k++) {
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128));
+ test_cuda_contraction<ColMajor>(k, 128, 128);
+ test_cuda_contraction<RowMajor>(k, 128, 128);
}
+}
+
+template<int DataLayout>
+void test_cuda_contraction_k() {
for (int k = 32; k < 256; k++) {
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k));
+ test_cuda_contraction<ColMajor>(128, k, 128);
+ test_cuda_contraction<RowMajor>(128, k, 128);
}
+}
+
+template<int DataLayout>
+void test_cuda_contraction_n() {
for (int k = 32; k < 256; k++) {
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128));
+ test_cuda_contraction<ColMajor>(128, 128, k);
+ test_cuda_contraction<RowMajor>(128, 128, k);
}
+}
- int m_sizes[] = {31, 39, 63, 64, 65,
- 127, 129, 255, 257, 511,
- 512, 513, 1023, 1024, 1025 };
- int n_sizes[] = {31, 39, 63, 64, 65,
- 127, 129, 255, 257, 511,
- 512, 513, 1023, 1024, 1025 };
-
- int k_sizes[] = { 31, 39, 63, 64, 65,
- 95, 96, 127, 129, 255,
- 257, 511, 512, 513, 1023,
- 1024, 1025};
- for (int i = 0; i <15; i++)
- for (int j = 0; j < 15; j++)
+template<int DataLayout>
+void test_cuda_contraction_sizes() {
+ int m_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257 , 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int n_sizes[] = { 31, 39, 63, 64, 65,
+ 127, 129, 255, 257, 511,
+ 512, 513, 1023, 1024, 1025};
+
+ int k_sizes[] = { 31, 39, 63, 64, 65,
+ 95, 96, 127, 129, 255,
+ 257, 511, 512, 513, 1023,
+ 1024, 1025};
+
+ for (int i = 0; i < 15; i++) {
+ for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) {
- CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
- CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+ test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
}
+ }
+ }
+}
+
+void test_cxx11_tensor_cuda()
+{
+ CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+
+ CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+
+ CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
+ CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+
+ CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
+ CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+
+ CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
+ CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
}
diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cu
index 49e1894ab..60f9314a5 100644
--- a/unsupported/test/cxx11_tensor_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -63,6 +63,10 @@ void test_cuda_elementwise_small() {
out(Eigen::array<int, 1>(i)),
in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
}
+
+ cudaFree(d_in1);
+ cudaFree(d_in2);
+ cudaFree(d_out);
}
void test_cuda_elementwise()
@@ -113,6 +117,11 @@ void test_cuda_elementwise()
}
}
}
+
+ cudaFree(d_in1);
+ cudaFree(d_in2);
+ cudaFree(d_in3);
+ cudaFree(d_out);
}
void test_cuda_reduction()
@@ -131,8 +140,7 @@ void test_cuda_reduction()
cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
@@ -159,10 +167,13 @@ void test_cuda_reduction()
VERIFY_IS_APPROX(out(i,j), expected);
}
}
+
+ cudaFree(d_in1);
+ cudaFree(d_out);
}
template<int DataLayout>
-static void test_cuda_contraction()
+void test_cuda_contraction()
{
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
@@ -189,8 +200,7 @@ static void test_cuda_contraction()
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
@@ -214,14 +224,18 @@ static void test_cuda_contraction()
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
- cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl;
+ std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
assert(false);
}
}
+
+ cudaFree(d_t_left);
+ cudaFree(d_t_right);
+ cudaFree(d_t_result);
}
template<int DataLayout>
-static void test_cuda_convolution_1d()
+void test_cuda_convolution_1d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 1, DataLayout> kernel(4);
@@ -243,8 +257,7 @@ static void test_cuda_convolution_1d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
@@ -269,9 +282,13 @@ static void test_cuda_convolution_1d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
-static void test_cuda_convolution_inner_dim_col_major_1d()
+void test_cuda_convolution_inner_dim_col_major_1d()
{
Tensor<float, 4, ColMajor> input(74,9,11,7);
Tensor<float, 1, ColMajor> kernel(4);
@@ -293,8 +310,7 @@ static void test_cuda_convolution_inner_dim_col_major_1d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
@@ -319,9 +335,13 @@ static void test_cuda_convolution_inner_dim_col_major_1d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
-static void test_cuda_convolution_inner_dim_row_major_1d()
+void test_cuda_convolution_inner_dim_row_major_1d()
{
Tensor<float, 4, RowMajor> input(7,9,11,74);
Tensor<float, 1, RowMajor> kernel(4);
@@ -343,8 +363,7 @@ static void test_cuda_convolution_inner_dim_row_major_1d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
@@ -369,10 +388,14 @@ static void test_cuda_convolution_inner_dim_row_major_1d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
template<int DataLayout>
-static void test_cuda_convolution_2d()
+void test_cuda_convolution_2d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 2, DataLayout> kernel(3,4);
@@ -394,8 +417,7 @@ static void test_cuda_convolution_2d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
@@ -430,10 +452,14 @@ static void test_cuda_convolution_2d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
template<int DataLayout>
-static void test_cuda_convolution_3d()
+void test_cuda_convolution_3d()
{
Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
Tensor<float, 3, DataLayout> kernel(3,4,2);
@@ -455,8 +481,7 @@ static void test_cuda_convolution_3d()
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
- cudaStream_t stream;
- assert(cudaStreamCreate(&stream) == cudaSuccess);
+ Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
@@ -505,6 +530,10 @@ static void test_cuda_convolution_3d()
}
}
}
+
+ cudaFree(d_input);
+ cudaFree(d_kernel);
+ cudaFree(d_out);
}
@@ -542,6 +571,9 @@ void test_cuda_lgamma(const Scalar stddev)
VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
}
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -578,6 +610,9 @@ void test_cuda_erf(const Scalar stddev)
VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
}
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -614,51 +649,50 @@ void test_cuda_erfc(const Scalar stddev)
VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
}
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
void test_cxx11_tensor_cuda()
{
- CALL_SUBTEST(test_cuda_elementwise_small());
- CALL_SUBTEST(test_cuda_elementwise());
- CALL_SUBTEST(test_cuda_reduction());
- CALL_SUBTEST(test_cuda_contraction<ColMajor>());
- CALL_SUBTEST(test_cuda_contraction<RowMajor>());
- CALL_SUBTEST(test_cuda_convolution_1d<ColMajor>());
- CALL_SUBTEST(test_cuda_convolution_1d<RowMajor>());
- CALL_SUBTEST(test_cuda_convolution_inner_dim_col_major_1d());
- CALL_SUBTEST(test_cuda_convolution_inner_dim_row_major_1d());
- CALL_SUBTEST(test_cuda_convolution_2d<ColMajor>());
- CALL_SUBTEST(test_cuda_convolution_2d<RowMajor>());
- CALL_SUBTEST(test_cuda_convolution_3d<ColMajor>());
- CALL_SUBTEST(test_cuda_convolution_3d<RowMajor>());
- CALL_SUBTEST(test_cuda_lgamma<float>(1.0f));
- CALL_SUBTEST(test_cuda_lgamma<float>(100.0f));
- CALL_SUBTEST(test_cuda_lgamma<float>(0.01f));
- CALL_SUBTEST(test_cuda_lgamma<float>(0.001f));
- CALL_SUBTEST(test_cuda_erf<float>(1.0f));
- CALL_SUBTEST(test_cuda_erf<float>(100.0f));
- CALL_SUBTEST(test_cuda_erf<float>(0.01f));
- CALL_SUBTEST(test_cuda_erf<float>(0.001f));
- CALL_SUBTEST(test_cuda_erfc<float>(1.0f));
+ CALL_SUBTEST_1(test_cuda_elementwise_small());
+ CALL_SUBTEST_1(test_cuda_elementwise());
+ CALL_SUBTEST_1(test_cuda_reduction());
+ CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
+ CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
+ CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
+ CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
+ CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
+ CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
+ CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
+ CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
+ CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
+ CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
+ CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
// CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
- CALL_SUBTEST(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
- CALL_SUBTEST(test_cuda_erfc<float>(0.01f));
- CALL_SUBTEST(test_cuda_erfc<float>(0.001f));
- CALL_SUBTEST(test_cuda_tanh<double>(1.0));
- CALL_SUBTEST(test_cuda_tanh<double>(100.0));
- CALL_SUBTEST(test_cuda_tanh<double>(0.01));
- CALL_SUBTEST(test_cuda_tanh<double>(0.001));
- CALL_SUBTEST(test_cuda_lgamma<double>(1.0));
- CALL_SUBTEST(test_cuda_lgamma<double>(100.0));
- CALL_SUBTEST(test_cuda_lgamma<double>(0.01));
- CALL_SUBTEST(test_cuda_lgamma<double>(0.001));
- CALL_SUBTEST(test_cuda_erf<double>(1.0));
- CALL_SUBTEST(test_cuda_erf<double>(100.0));
- CALL_SUBTEST(test_cuda_erf<double>(0.01));
- CALL_SUBTEST(test_cuda_erf<double>(0.001));
- CALL_SUBTEST(test_cuda_erfc<double>(1.0));
+ CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
+ CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
+ CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
+ CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
+ CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
+ CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
+ CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
+ CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
+ CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
+ CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
+ CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
+ CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
// CALL_SUBTEST(test_cuda_erfc<double>(100.0));
- CALL_SUBTEST(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
- CALL_SUBTEST(test_cuda_erfc<double>(0.01));
- CALL_SUBTEST(test_cuda_erfc<double>(0.001));
+ CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
+ CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
+ CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
}
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cu
index ed5dd7505..cbe9e6449 100644
--- a/unsupported/test/cxx11_tensor_device.cpp
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -109,19 +109,19 @@ struct GPUContext {
// The actual expression to evaluate
template <typename Context>
-static void test_contextual_eval(Context* context)
+void test_contextual_eval(Context* context)
{
context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
}
template <typename Context>
-static void test_forced_contextual_eval(Context* context)
+void test_forced_contextual_eval(Context* context)
{
context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
}
template <typename Context>
-static void test_compound_assignment(Context* context)
+void test_compound_assignment(Context* context)
{
context->out().device(context->device()) = context->in1().constant(2.718f);
context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
@@ -129,7 +129,7 @@ static void test_compound_assignment(Context* context)
template <typename Context>
-static void test_contraction(Context* context)
+void test_contraction(Context* context)
{
Eigen::array<std::pair<int, int>, 2> dims;
dims[0] = std::make_pair(1, 1);
@@ -145,7 +145,7 @@ static void test_contraction(Context* context)
template <typename Context>
-static void test_1d_convolution(Context* context)
+void test_1d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(40,49,70);
@@ -155,7 +155,7 @@ static void test_1d_convolution(Context* context)
}
template <typename Context>
-static void test_2d_convolution(Context* context)
+void test_2d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(40,49,69);
@@ -165,7 +165,7 @@ static void test_2d_convolution(Context* context)
}
template <typename Context>
-static void test_3d_convolution(Context* context)
+void test_3d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(39,49,69);
@@ -175,7 +175,7 @@ static void test_3d_convolution(Context* context)
}
-static void test_cpu() {
+void test_cpu() {
Eigen::Tensor<float, 3> in1(40,50,70);
Eigen::Tensor<float, 3> in2(40,50,70);
Eigen::Tensor<float, 3> out(40,50,70);
@@ -267,7 +267,7 @@ static void test_cpu() {
}
}
-static void test_gpu() {
+void test_gpu() {
Eigen::Tensor<float, 3> in1(40,50,70);
Eigen::Tensor<float, 3> in2(40,50,70);
Eigen::Tensor<float, 3> out(40,50,70);
@@ -383,6 +383,6 @@ static void test_gpu() {
void test_cxx11_tensor_device()
{
- CALL_SUBTEST(test_cpu());
- CALL_SUBTEST(test_gpu());
+ CALL_SUBTEST_1(test_cpu());
+ CALL_SUBTEST_2(test_gpu());
}
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
new file mode 100644
index 000000000..9130fff35
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_empty_tensor()
+{
+ Tensor<float, 2> source;
+ Tensor<float, 2> tgt1 = source;
+ Tensor<float, 2> tgt2(source);
+ Tensor<float, 2> tgt3;
+ tgt3 = tgt1;
+ tgt3 = tgt2;
+}
+
+static void test_empty_fixed_size_tensor()
+{
+ TensorFixedSize<float, Sizes<0>> source;
+ TensorFixedSize<float, Sizes<0>> tgt1 = source;
+ TensorFixedSize<float, Sizes<0>> tgt2(source);
+ TensorFixedSize<float, Sizes<0>> tgt3;
+ tgt3 = tgt1;
+ tgt3 = tgt2;
+}
+
+
+void test_cxx11_tensor_empty()
+{
+ CALL_SUBTEST(test_empty_tensor());
+ CALL_SUBTEST(test_empty_fixed_size_tensor());
+}
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cpp b/unsupported/test/cxx11_tensor_random_cuda.cu
index 5d091de15..5d091de15 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index 9e06eb126..cad0c08e0 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -48,9 +48,12 @@ static void test_full_reductions() {
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+ gpu_device.deallocate(gpu_in_ptr);
+ gpu_device.deallocate(gpu_out_ptr);
}
void test_cxx11_tensor_reduction_cuda() {
- CALL_SUBTEST(test_full_reductions<ColMajor>());
- CALL_SUBTEST(test_full_reductions<RowMajor>());
+ CALL_SUBTEST_1(test_full_reductions<ColMajor>());
+ CALL_SUBTEST_2(test_full_reductions<RowMajor>());
}
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index e28cf55e2..e46197464 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -17,7 +17,7 @@
using Eigen::Tensor;
-static void test_multithread_elementwise()
+void test_multithread_elementwise()
{
Tensor<float, 3> in1(2,3,7);
Tensor<float, 3> in2(2,3,7);
@@ -40,7 +40,7 @@ static void test_multithread_elementwise()
}
-static void test_multithread_compound_assignment()
+void test_multithread_compound_assignment()
{
Tensor<float, 3> in1(2,3,7);
Tensor<float, 3> in2(2,3,7);
@@ -64,7 +64,7 @@ static void test_multithread_compound_assignment()
}
template<int DataLayout>
-static void test_multithread_contraction()
+void test_multithread_contraction()
{
Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
@@ -91,15 +91,20 @@ static void test_multithread_contraction()
for (ptrdiff_t i = 0; i < t_result.size(); i++) {
VERIFY(&t_result.data()[i] != &m_result.data()[i]);
- if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
- std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
- assert(false);
+ if (fabs(t_result(i) - m_result(i)) < 1e-4) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
+ continue;
}
+ std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+ << " vs " << m_result(i) << std::endl;
+ assert(false);
}
}
template<int DataLayout>
-static void test_contraction_corner_cases()
+void test_contraction_corner_cases()
{
Tensor<float, 2, DataLayout> t_left(32, 500);
Tensor<float, 2, DataLayout> t_right(32, 28*28);
@@ -186,7 +191,7 @@ static void test_contraction_corner_cases()
}
template<int DataLayout>
-static void test_multithread_contraction_agrees_with_singlethread() {
+void test_multithread_contraction_agrees_with_singlethread() {
int contract_size = internal::random<int>(1, 5000);
Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
@@ -229,7 +234,7 @@ static void test_multithread_contraction_agrees_with_singlethread() {
template<int DataLayout>
-static void test_multithreaded_reductions() {
+void test_multithreaded_reductions() {
const int num_threads = internal::random<int>(3, 11);
ThreadPool thread_pool(num_threads);
Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
@@ -239,19 +244,19 @@ static void test_multithreaded_reductions() {
Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
t1.setRandom();
- Tensor<float, 1, DataLayout> full_redux(1);
+ Tensor<float, 0, DataLayout> full_redux;
full_redux = t1.sum();
- Tensor<float, 1, DataLayout> full_redux_tp(1);
+ Tensor<float, 0, DataLayout> full_redux_tp;
full_redux_tp.device(thread_pool_device) = t1.sum();
// Check that the single-threaded and the multi-threaded reductions return
// the same result.
- VERIFY_IS_APPROX(full_redux(0), full_redux_tp(0));
+ VERIFY_IS_APPROX(full_redux(), full_redux_tp());
}
-static void test_memcpy() {
+void test_memcpy() {
for (int i = 0; i < 5; ++i) {
const int num_threads = internal::random<int>(3, 11);
@@ -270,7 +275,7 @@ static void test_memcpy() {
}
-static void test_multithread_random()
+void test_multithread_random()
{
Eigen::ThreadPool tp(2);
Eigen::ThreadPoolDevice device(&tp, 2);
@@ -278,26 +283,52 @@ static void test_multithread_random()
t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
}
+template<int DataLayout>
+void test_multithread_shuffle()
+{
+ Tensor<float, 4, DataLayout> tensor(17,5,7,11);
+ tensor.setRandom();
+
+ const int num_threads = internal::random<int>(2, 11);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
+ array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
+ shuffle.device(device) = tensor.shuffle(shuffles);
+
+ for (int i = 0; i < 17; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ for (int k = 0; k < 7; ++k) {
+ for (int l = 0; l < 11; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
+ }
+ }
+ }
+ }
+}
+
void test_cxx11_tensor_thread_pool()
{
- CALL_SUBTEST(test_multithread_elementwise());
- CALL_SUBTEST(test_multithread_compound_assignment());
+ CALL_SUBTEST_1(test_multithread_elementwise());
+ CALL_SUBTEST_1(test_multithread_compound_assignment());
- CALL_SUBTEST(test_multithread_contraction<ColMajor>());
- CALL_SUBTEST(test_multithread_contraction<RowMajor>());
+ CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
+ CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());
- CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
- CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
// Exercise various cases that have been problematic in the past.
- CALL_SUBTEST(test_contraction_corner_cases<ColMajor>());
- CALL_SUBTEST(test_contraction_corner_cases<RowMajor>());
-
- CALL_SUBTEST(test_multithreaded_reductions<ColMajor>());
- CALL_SUBTEST(test_multithreaded_reductions<RowMajor>());
+ CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
+ CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
- CALL_SUBTEST(test_memcpy());
+ CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
+ CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
- CALL_SUBTEST(test_multithread_random());
+ CALL_SUBTEST_6(test_memcpy());
+ CALL_SUBTEST_6(test_multithread_random());
+ CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
+ CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
}