-rw-r--r--  CMakeLists.txt | 12
-rw-r--r--  Eigen/Core | 33
-rw-r--r--  Eigen/src/Cholesky/LDLT.h | 66
-rw-r--r--  Eigen/src/Cholesky/LLT.h | 53
-rw-r--r--  Eigen/src/Core/AssignEvaluator.h | 79
-rw-r--r--  Eigen/src/Core/ConditionEstimator.h | 166
-rw-r--r--  Eigen/src/Core/CoreEvaluators.h | 4
-rw-r--r--  Eigen/src/Core/GeneralProduct.h | 2
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 8
-rw-r--r--  Eigen/src/Core/MathFunctions.h | 156
-rw-r--r--  Eigen/src/Core/ProductEvaluators.h | 38
-rw-r--r--  Eigen/src/Core/Redux.h | 30
-rw-r--r--  Eigen/src/Core/SolveTriangular.h | 2
-rw-r--r--  Eigen/src/Core/SpecialFunctions.h | 164
-rw-r--r--  Eigen/src/Core/StableNorm.h | 7
-rw-r--r--  Eigen/src/Core/TriangularMatrix.h | 2
-rw-r--r--  Eigen/src/Core/arch/CUDA/Half.h | 91
-rw-r--r--  Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 100
-rw-r--r--  Eigen/src/Core/arch/CUDA/TypeCasting.h | 25
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h | 4
-rw-r--r--  Eigen/src/Core/functors/BinaryFunctors.h | 16
-rw-r--r--  Eigen/src/Core/functors/UnaryFunctors.h | 110
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 79
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 2
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h (renamed from Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h) | 59
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h (renamed from Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h) | 45
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector_BLAS.h (renamed from Eigen/src/Core/products/GeneralMatrixVector_MKL.h) | 43
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h (renamed from Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h) | 132
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h (renamed from Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h) | 38
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h (renamed from Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h) | 107
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixVector.h | 4
-rw-r--r--  Eigen/src/Core/products/TriangularMatrixVector_BLAS.h (renamed from Eigen/src/Core/products/TriangularMatrixVector_MKL.h) | 94
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix.h | 2
-rw-r--r--  Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h (renamed from Eigen/src/Core/products/TriangularSolverMatrix_MKL.h) | 56
-rw-r--r--  Eigen/src/Core/util/MKL_support.h | 48
-rw-r--r--  Eigen/src/Core/util/Macros.h | 16
-rw-r--r--  Eigen/src/Geometry/Rotation2D.h | 10
-rw-r--r--  Eigen/src/Householder/HouseholderSequence.h | 3
-rw-r--r--  Eigen/src/LU/FullPivLU.h | 13
-rw-r--r--  Eigen/src/LU/PartialPivLU.h | 19
-rw-r--r--  Eigen/src/QR/CompleteOrthogonalDecomposition.h | 4
-rw-r--r--  Eigen/src/SVD/BDCSVD.h | 61
-rw-r--r--  Eigen/src/SVD/JacobiSVD.h | 70
-rw-r--r--  Eigen/src/SparseCore/SparseCwiseUnaryOp.h | 2
-rw-r--r--  Eigen/src/SuperLUSupport/SuperLUSupport.h | 2
-rw-r--r--  Eigen/src/misc/blas.h | 418
-rw-r--r--  Eigen/src/misc/lapack.h | 152
-rw-r--r--  Eigen/src/plugins/ArrayCwiseBinaryOps.h | 18
-rw-r--r--  bench/BenchTimer.h | 1
-rw-r--r--  bench/tensors/tensor_benchmarks.h | 15
-rw-r--r--  bench/tensors/tensor_benchmarks_fp16_gpu.cu | 11
-rw-r--r--  blas/common.h | 29
-rw-r--r--  blas/level1_impl.h | 6
-rw-r--r--  blas/level2_cplx_impl.h | 13
-rw-r--r--  blas/level2_impl.h | 33
-rw-r--r--  blas/level2_real_impl.h | 33
-rw-r--r--  blas/level3_impl.h | 89
-rw-r--r--  blas/single.cpp | 2
-rw-r--r--  cmake/EigenTesting.cmake | 27
-rw-r--r--  doc/TutorialReshapeSlicing.dox | 4
-rw-r--r--  doc/UsingIntelMKL.dox | 2
-rw-r--r--  lapack/lapack_common.h | 1
-rw-r--r--  test/CMakeLists.txt | 11
-rw-r--r--  test/array.cpp | 2
-rw-r--r--  test/cholesky.cpp | 64
-rw-r--r--  test/fastmath.cpp | 2
-rw-r--r--  test/geo_hyperplane.cpp | 10
-rw-r--r--  test/geo_quaternion.cpp | 6
-rw-r--r--  test/geo_transformations.cpp | 2
-rw-r--r--  test/linearstructure.cpp | 3
-rw-r--r--  test/lu.cpp | 23
-rw-r--r--  test/main.h | 18
-rw-r--r--  test/mixingtypes.cpp | 8
-rw-r--r--  test/packetmath.cpp | 2
-rw-r--r--  test/product_large.cpp | 2
-rw-r--r--  test/qr_colpivoting.cpp | 2
-rw-r--r--  test/rand.cpp | 3
-rw-r--r--  test/sparse_basic.cpp | 4
-rw-r--r--  test/sparse_block.cpp | 2
-rw-r--r--  test/sparse_product.cpp | 2
-rw-r--r--  test/sparse_vector.cpp | 2
-rw-r--r--  test/sparseqr.cpp | 2
-rw-r--r--  test/svd_common.h | 4
-rw-r--r--  test/svd_fill.h | 14
-rw-r--r--  test/swap.cpp | 11
-rw-r--r--  test/triangular.cpp | 4
-rw-r--r--  test/vectorization_logic.cpp | 49
-rw-r--r--  unsupported/Eigen/CXX11/CMakeLists.txt | 2
-rw-r--r--  unsupported/Eigen/CXX11/Core | 51
-rw-r--r--  unsupported/Eigen/CXX11/Tensor | 12
-rw-r--r--  unsupported/Eigen/CXX11/TensorSymmetry | 2
-rw-r--r--  unsupported/Eigen/CXX11/ThreadPool | 65
-rw-r--r--  unsupported/Eigen/CXX11/src/CMakeLists.txt | 3
-rw-r--r--  unsupported/Eigen/CXX11/src/Core/CMakeLists.txt | 1
-rw-r--r--  unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 29
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 51
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 66
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 15
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 226
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 64
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 17
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 51
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 213
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 25
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 20
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 157
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 18
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 21
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 42
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 194
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 92
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 18
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 21
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 30
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 31
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 9
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 62
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 30
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 135
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 13
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 40
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 36
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 53
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 26
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h | 234
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 232
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 210
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 127
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 38
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 22
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h | 26
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h | 20
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CMakeLists.txt | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CXX11Meta.h (renamed from unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h) | 18
-rw-r--r--  unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h (renamed from unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h) | 0
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h (renamed from unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h) | 6
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h (renamed from unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h) | 2
-rw-r--r--  unsupported/Eigen/CXX11/src/util/MaxSizeVector.h (renamed from unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h) | 2
-rw-r--r--  unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h | 2
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h | 4
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h | 3
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h | 5
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixPower.h | 2
-rw-r--r--  unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h | 4
-rw-r--r--  unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h | 6
-rw-r--r--  unsupported/Eigen/src/Splines/Spline.h | 4
-rw-r--r--  unsupported/test/CMakeLists.txt | 53
-rw-r--r--  unsupported/test/FFTW.cpp | 2
-rw-r--r--  unsupported/test/NonLinearOptimization.cpp | 16
-rw-r--r--  unsupported/test/autodiff.cpp | 3
-rw-r--r--  unsupported/test/cxx11_eventcount.cpp | 140
-rw-r--r--  unsupported/test/cxx11_float16.cpp (renamed from unsupported/test/float16.cpp) | 93
-rw-r--r--  unsupported/test/cxx11_meta.cpp | 2
-rw-r--r--  unsupported/test/cxx11_runqueue.cpp | 227
-rw-r--r--  unsupported/test/cxx11_tensor_argmax.cpp | 8
-rw-r--r--  unsupported/test/cxx11_tensor_contract_cuda.cu | 62
-rw-r--r--  unsupported/test/cxx11_tensor_contraction.cpp | 9
-rw-r--r--  unsupported/test/cxx11_tensor_cuda.cu | 31
-rw-r--r--  unsupported/test/cxx11_tensor_device.cu | 4
-rw-r--r--  unsupported/test/cxx11_tensor_dimension.cpp | 13
-rw-r--r--  unsupported/test/cxx11_tensor_empty.cpp | 8
-rw-r--r--  unsupported/test/cxx11_tensor_expr.cpp | 24
-rw-r--r--  unsupported/test/cxx11_tensor_fft.cpp | 8
-rw-r--r--  unsupported/test/cxx11_tensor_fixed_size.cpp | 24
-rw-r--r--  unsupported/test/cxx11_tensor_forced_eval.cpp | 9
-rw-r--r--  unsupported/test/cxx11_tensor_image_patch.cpp | 49
-rw-r--r--  unsupported/test/cxx11_tensor_map.cpp | 36
-rw-r--r--  unsupported/test/cxx11_tensor_math.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_mixed_indices.cpp | 4
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_cuda.cu | 226
-rw-r--r--  unsupported/test/cxx11_tensor_simple.cpp | 5
-rw-r--r--  unsupported/test/cxx11_tensor_thread_pool.cpp | 41
-rw-r--r--  unsupported/test/levenberg_marquardt.cpp | 30
-rw-r--r--  unsupported/test/matrix_function.cpp | 4
-rw-r--r--  unsupported/test/matrix_power.cpp | 2
190 files changed, 5155 insertions, 2184 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 51beba118..b1247f75f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,7 +120,7 @@ endmacro(ei_add_cxx_compiler_flag)
if(NOT MSVC)
# We assume that other compilers are partly compatible with GNUCC
- # clang outputs some warnings for unknwon flags that are not caught by check_cxx_compiler_flag
+ # clang outputs some warnings for unknown flags that are not caught by check_cxx_compiler_flag
# adding -Werror turns such warnings into errors
check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR)
if(COMPILER_SUPPORT_WERROR)
@@ -143,6 +143,8 @@ if(NOT MSVC)
ei_add_cxx_compiler_flag("-Wshorten-64-to-32")
ei_add_cxx_compiler_flag("-Wenum-conversion")
ei_add_cxx_compiler_flag("-Wc++11-extensions")
+ ei_add_cxx_compiler_flag("-Wdouble-promotion")
+# ei_add_cxx_compiler_flag("-Wconversion")
# -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
# if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
@@ -158,7 +160,7 @@ if(NOT MSVC)
ei_add_cxx_compiler_flag("-fno-common")
ei_add_cxx_compiler_flag("-fstrict-aliasing")
ei_add_cxx_compiler_flag("-wd981") # disable ICC's "operands are evaluated in unspecified order" remark
- ei_add_cxx_compiler_flag("-wd2304") # disbale ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
+ ei_add_cxx_compiler_flag("-wd2304") # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor
# The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails
@@ -221,6 +223,12 @@ if(NOT MSVC)
message(STATUS "Enabling FMA in tests/examples")
endif()
+ option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
+ if(EIGEN_TEST_F16C)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
+ message(STATUS "Enabling F16C in tests/examples")
+ endif()
+
option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
if(EIGEN_TEST_ALTIVEC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
diff --git a/Eigen/Core b/Eigen/Core
index 1e62f3ec1..50040135f 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -33,13 +33,13 @@
#ifdef EIGEN_EXCEPTIONS
#undef EIGEN_EXCEPTIONS
#endif
-
+
// All functions callable from CUDA code must be qualified with __device__
#define EIGEN_DEVICE_FUNC __host__ __device__
-
+
#else
#define EIGEN_DEVICE_FUNC
-
+
#endif
// When compiling CUDA device code with NVCC, pull in math functions from the
@@ -204,7 +204,7 @@
#endif
#endif
-#if defined(__F16C__)
+#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
// We can use the optimized fp16 to float and float to fp16 conversion routines
#define EIGEN_HAS_FP16_C
#endif
@@ -214,10 +214,14 @@
#include <vector_types.h>
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#define EIGEN_HAS_CUDA_FP16
- #include <cuda_fp16.h>
#endif
#endif
+#if defined EIGEN_HAS_CUDA_FP16
+ #include <host_defines.h>
+ #include <cuda_fp16.h>
+#endif
+
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
#define EIGEN_HAS_OPENMP
#endif
@@ -296,7 +300,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
// ensure QNX/QCC support
using std::size_t;
-// gcc 4.6.0 wants std:: for ptrdiff_t
+// gcc 4.6.0 wants std:: for ptrdiff_t
using std::ptrdiff_t;
/** \defgroup Core_Module Core module
@@ -440,6 +444,7 @@ using std::ptrdiff_t;
#include "src/Core/products/TriangularSolverVector.h"
#include "src/Core/BandMatrix.h"
#include "src/Core/CoreIterators.h"
+#include "src/Core/ConditionEstimator.h"
#include "src/Core/BooleanRedux.h"
#include "src/Core/Select.h"
@@ -450,14 +455,14 @@ using std::ptrdiff_t;
#include "src/Core/ArrayWrapper.h"
#ifdef EIGEN_USE_BLAS
-#include "src/Core/products/GeneralMatrixMatrix_MKL.h"
-#include "src/Core/products/GeneralMatrixVector_MKL.h"
-#include "src/Core/products/GeneralMatrixMatrixTriangular_MKL.h"
-#include "src/Core/products/SelfadjointMatrixMatrix_MKL.h"
-#include "src/Core/products/SelfadjointMatrixVector_MKL.h"
-#include "src/Core/products/TriangularMatrixMatrix_MKL.h"
-#include "src/Core/products/TriangularMatrixVector_MKL.h"
-#include "src/Core/products/TriangularSolverMatrix_MKL.h"
+#include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
+#include "src/Core/products/GeneralMatrixVector_BLAS.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixMatrix_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularMatrixMatrix_BLAS.h"
+#include "src/Core/products/TriangularMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularSolverMatrix_BLAS.h"
#endif // EIGEN_USE_BLAS
#ifdef EIGEN_USE_MKL_VML
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index c3cc3746c..538aff956 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -13,7 +13,7 @@
#ifndef EIGEN_LDLT_H
#define EIGEN_LDLT_H
-namespace Eigen {
+namespace Eigen {
namespace internal {
template<typename MatrixType, int UpLo> struct LDLT_Traits;
@@ -73,11 +73,11 @@ template<typename _MatrixType, int _UpLo> class LDLT
* The default constructor is useful in cases in which the user intends to
* perform decompositions via LDLT::compute(const MatrixType&).
*/
- LDLT()
- : m_matrix(),
- m_transpositions(),
+ LDLT()
+ : m_matrix(),
+ m_transpositions(),
m_sign(internal::ZeroSign),
- m_isInitialized(false)
+ m_isInitialized(false)
{}
/** \brief Default Constructor with memory preallocation
@@ -168,7 +168,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
* \note_about_checking_solutions
*
* More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$
- * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$,
+ * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$,
* \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
* \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
* least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
@@ -192,6 +192,15 @@ template<typename _MatrixType, int _UpLo> class LDLT
template<typename InputType>
LDLT& compute(const EigenBase<InputType>& matrix);
+ /** \returns an estimate of the reciprocal condition number of the matrix of
+ * which \c *this is the LDLT decomposition.
+ */
+ RealScalar rcond() const
+ {
+ eigen_assert(m_isInitialized && "LDLT is not initialized.");
+ return internal::rcond_estimate_helper(m_l1_norm, *this);
+ }
+
template <typename Derived>
LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1);
@@ -207,6 +216,13 @@ template<typename _MatrixType, int _UpLo> class LDLT
MatrixType reconstructedMatrix() const;
+ /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
+ *
+ * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+ * \code x = decomposition.adjoint().solve(b) \endcode
+ */
+ const LDLT& adjoint() const { return *this; };
+
inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }
@@ -220,7 +236,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
eigen_assert(m_isInitialized && "LDLT is not initialized.");
return Success;
}
-
+
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
@@ -228,7 +244,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
#endif
protected:
-
+
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
@@ -241,6 +257,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
* is not stored), and the diagonal entries correspond to D.
*/
MatrixType m_matrix;
+ RealScalar m_l1_norm;
TranspositionType m_transpositions;
TmpMatrixType m_temporary;
internal::SignMatrix m_sign;
@@ -266,8 +283,8 @@ template<> struct ldlt_inplace<Lower>
if (size <= 1)
{
transpositions.setIdentity();
- if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef;
- else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef;
+ if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef;
+ else if (numext::real(mat.coeff(0,0)) < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
else sign = ZeroSign;
return true;
}
@@ -314,7 +331,7 @@ template<> struct ldlt_inplace<Lower>
if(rs>0)
A21.noalias() -= A20 * temp.head(k);
}
-
+
// In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
// was smaller than the cutoff value. However, since LDLT is not rank-revealing
// we should only make sure that we do not introduce INF or NaN values.
@@ -324,12 +341,12 @@ template<> struct ldlt_inplace<Lower>
A21 /= realAkk;
if (sign == PositiveSemiDef) {
- if (realAkk < 0) sign = Indefinite;
+ if (realAkk < static_cast<RealScalar>(0)) sign = Indefinite;
} else if (sign == NegativeSemiDef) {
- if (realAkk > 0) sign = Indefinite;
+ if (realAkk > static_cast<RealScalar>(0)) sign = Indefinite;
} else if (sign == ZeroSign) {
- if (realAkk > 0) sign = PositiveSemiDef;
- else if (realAkk < 0) sign = NegativeSemiDef;
+ if (realAkk > static_cast<RealScalar>(0)) sign = PositiveSemiDef;
+ else if (realAkk < static_cast<RealScalar>(0)) sign = NegativeSemiDef;
}
}
@@ -433,12 +450,25 @@ template<typename InputType>
LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
{
check_template_parameters();
-
+
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix = a.derived();
+ // Compute matrix L1 norm = max abs column sum.
+ m_l1_norm = RealScalar(0);
+ // TODO move this code to SelfAdjointView
+ for (Index col = 0; col < size; ++col) {
+ RealScalar abs_col_sum;
+ if (_UpLo == Lower)
+ abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+ else
+ abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+ if (abs_col_sum > m_l1_norm)
+ m_l1_norm = abs_col_sum;
+ }
+
m_transpositions.resize(size);
m_isInitialized = false;
m_temporary.resize(size);
@@ -466,7 +496,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
eigen_assert(m_matrix.rows()==size);
}
else
- {
+ {
m_matrix.resize(size,size);
m_matrix.setZero();
m_transpositions.resize(size);
@@ -505,7 +535,7 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
// diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
-
+
for (Index i = 0; i < vecD.size(); ++i)
{
if(abs(vecD(i)) > tolerance)
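The rcond() member added to LDLT above can be exercised as in the following minimal sketch (the matrix data is hypothetical; it only assumes an Eigen build that includes this patch):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      // Build a symmetric positive semi-definite matrix from random data.
      Eigen::MatrixXd M = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd A = M * M.transpose();

      Eigen::LDLT<Eigen::MatrixXd> ldlt(A);
      // rcond() estimates 1 / (||A||_1 * ||inv(A)||_1) in O(n^2) once the
      // decomposition (and the L1 norm cached in compute()) is available.
      std::cout << "rcond estimate: " << ldlt.rcond() << "\n";
    }
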
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 74cf5bfe1..19578b216 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_LLT_H
#define EIGEN_LLT_H
-namespace Eigen {
+namespace Eigen {
namespace internal{
template<typename MatrixType, int UpLo> struct LLT_Traits;
@@ -40,7 +40,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
*
* Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out
- *
+ *
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
@@ -135,6 +135,16 @@ template<typename _MatrixType, int _UpLo> class LLT
template<typename InputType>
LLT& compute(const EigenBase<InputType>& matrix);
+ /** \returns an estimate of the reciprocal condition number of the matrix of
+ * which \c *this is the Cholesky decomposition.
+ */
+ RealScalar rcond() const
+ {
+ eigen_assert(m_isInitialized && "LLT is not initialized.");
+ eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative");
+ return internal::rcond_estimate_helper(m_l1_norm, *this);
+ }
+
/** \returns the LLT decomposition matrix
*
* TODO: document the storage layout
@@ -159,12 +169,19 @@ template<typename _MatrixType, int _UpLo> class LLT
return m_info;
}
+ /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint.
+ *
+ * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+ * \code x = decomposition.adjoint().solve(b) \endcode
+ */
+ const LLT& adjoint() const { return *this; };
+
inline Index rows() const { return m_matrix.rows(); }
inline Index cols() const { return m_matrix.cols(); }
template<typename VectorType>
LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
-
+
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
@@ -172,17 +189,18 @@ template<typename _MatrixType, int _UpLo> class LLT
#endif
protected:
-
+
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
-
+
/** \internal
* Used to compute and store L
* The strict upper part is not used and even not initialized.
*/
MatrixType m_matrix;
+ RealScalar m_l1_norm;
bool m_isInitialized;
ComputationInfo m_info;
};
@@ -268,7 +286,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
static Index unblocked(MatrixType& mat)
{
using std::sqrt;
-
+
eigen_assert(mat.rows()==mat.cols());
const Index size = mat.rows();
for(Index k = 0; k < size; ++k)
@@ -328,7 +346,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
}
};
-
+
template<typename Scalar> struct llt_inplace<Scalar, Upper>
{
typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -387,12 +405,25 @@ template<typename InputType>
LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
{
check_template_parameters();
-
+
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix.resize(size, size);
m_matrix = a.derived();
+ // Compute matrix L1 norm = max abs column sum.
+ m_l1_norm = RealScalar(0);
+ // TODO move this code to SelfAdjointView
+ for (Index col = 0; col < size; ++col) {
+ RealScalar abs_col_sum;
+ if (_UpLo == Lower)
+ abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+ else
+ abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+ if (abs_col_sum > m_l1_norm)
+ m_l1_norm = abs_col_sum;
+ }
+
m_isInitialized = true;
bool ok = Traits::inplace_decomposition(m_matrix);
m_info = ok ? Success : NumericalIssue;
@@ -419,7 +450,7 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c
return *this;
}
-
+
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename _MatrixType,int _UpLo>
template<typename RhsType, typename DstType>
@@ -431,7 +462,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
#endif
/** \internal use x = llt_object.solve(x);
- *
+ *
* This is the \em in-place version of solve().
*
* \param bAndX represents both the right-hand side matrix b and result x.
@@ -483,7 +514,7 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
return LLT<PlainObject,UpLo>(m_matrix);
}
#endif // __CUDACC__
-
+
} // end namespace Eigen
#endif // EIGEN_LLT_H
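The adjoint() accessor added to LLT above enables exactly the generic pattern cited in its documentation comment, x = decomposition.adjoint().solve(b). A short sketch, assuming a positive-definite A:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd M = Eigen::MatrixXd::Random(3, 3);
      // Shift the diagonal to make A safely positive definite.
      Eigen::MatrixXd A = M * M.transpose() + 3.0 * Eigen::MatrixXd::Identity(3, 3);
      Eigen::VectorXd b = Eigen::VectorXd::Ones(3);

      Eigen::LLT<Eigen::MatrixXd> llt(A);
      // For a self-adjoint decomposition, adjoint() returns the decomposition
      // itself, so generic code solving A^* x = b works unchanged.
      Eigen::VectorXd x = llt.adjoint().solve(b);
    }
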
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index a9a524130..b1193e421 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -29,13 +29,10 @@ struct copy_using_evaluator_traits
{
typedef typename DstEvaluator::XprType Dst;
typedef typename Dst::Scalar DstScalar;
- // TODO distinguish between linear traversal and inner-traversals
- typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type PacketType;
enum {
DstFlags = DstEvaluator::Flags,
- SrcFlags = SrcEvaluator::Flags,
- RequiredAlignment = unpacket_traits<PacketType>::alignment
+ SrcFlags = SrcEvaluator::Flags
};
public:
@@ -55,10 +52,25 @@ private:
: int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
: int(Dst::MaxRowsAtCompileTime),
OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
- MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
- PacketSize = unpacket_traits<PacketType>::size
+ MaxSizeAtCompileTime = Dst::SizeAtCompileTime
+ };
+
+ // TODO distinguish between linear traversal and inner-traversals
+ typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
+ typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
+
+ enum {
+ LinearPacketSize = unpacket_traits<LinearPacketType>::size,
+ InnerPacketSize = unpacket_traits<InnerPacketType>::size
+ };
+
+public:
+ enum {
+ LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
+ InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
};
+private:
enum {
DstIsRowMajor = DstFlags&RowMajorBit,
SrcIsRowMajor = SrcFlags&RowMajorBit,
@@ -67,16 +79,16 @@ private:
&& (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
&& (functor_traits<AssignFunc>::PacketAccess),
MayInnerVectorize = MightVectorize
- && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
- && int(OuterStride)!=Dynamic && int(OuterStride)%int(PacketSize)==0
- && int(JointAlignment)>=int(RequiredAlignment),
+ && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
+ && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
+ && int(JointAlignment)>=int(InnerRequiredAlignment),
MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
- && ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
+ && ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */
MaySliceVectorize = MightVectorize && DstHasDirectAccess
- && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
+ && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize)
/* slice vectorization can be slow, so we only want it if the slices are big, which is
indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
in a fixed-size matrix */
@@ -84,7 +96,8 @@ private:
public:
enum {
- Traversal = int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
+ Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
+ : int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
: int(MayLinearize) ? int(LinearTraversal)
@@ -94,9 +107,14 @@ public:
|| int(Traversal) == SliceVectorizedTraversal
};
+ typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;
+
private:
enum {
- UnrollingLimit = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
+ ActualPacketSize = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
+ : Vectorized ? InnerPacketSize
+ : 1,
+ UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
&& int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
MayUnrollInner = int(InnerSize) != Dynamic
@@ -112,7 +130,7 @@ public:
: int(NoUnrolling)
)
: int(Traversal) == int(LinearVectorizedTraversal)
- ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling)
+ ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling)
: int(NoUnrolling) )
: int(Traversal) == int(LinearTraversal)
? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
@@ -131,11 +149,13 @@ public:
std::cerr.unsetf(std::ios::hex);
EIGEN_DEBUG_VAR(DstAlignment)
EIGEN_DEBUG_VAR(SrcAlignment)
- EIGEN_DEBUG_VAR(RequiredAlignment)
+ EIGEN_DEBUG_VAR(LinearRequiredAlignment)
+ EIGEN_DEBUG_VAR(InnerRequiredAlignment)
EIGEN_DEBUG_VAR(JointAlignment)
EIGEN_DEBUG_VAR(InnerSize)
EIGEN_DEBUG_VAR(InnerMaxSize)
- EIGEN_DEBUG_VAR(PacketSize)
+ EIGEN_DEBUG_VAR(LinearPacketSize)
+ EIGEN_DEBUG_VAR(InnerPacketSize)
EIGEN_DEBUG_VAR(StorageOrdersAgree)
EIGEN_DEBUG_VAR(MightVectorize)
EIGEN_DEBUG_VAR(MayLinearize)
@@ -236,12 +256,13 @@ struct copy_using_evaluator_innervec_CompleteUnrolling
enum {
outer = Index / DstXprType::InnerSizeAtCompileTime,
inner = Index % DstXprType::InnerSizeAtCompileTime,
- JointAlignment = Kernel::AssignmentTraits::JointAlignment
+ JointAlignment = Kernel::AssignmentTraits::JointAlignment,
+ DefaultAlignment = unpacket_traits<PacketType>::alignment
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
- kernel.template assignPacketByOuterInner<Aligned, JointAlignment, PacketType>(outer, inner);
+ kernel.template assignPacketByOuterInner<DefaultAlignment, JointAlignment, PacketType>(outer, inner);
enum { NextIndex = Index + unpacket_traits<PacketType>::size };
copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
}
@@ -257,9 +278,12 @@ template<typename Kernel, int Index_, int Stop>
struct copy_using_evaluator_innervec_InnerUnrolling
{
typedef typename Kernel::PacketType PacketType;
+ enum {
+ DefaultAlignment = unpacket_traits<PacketType>::alignment
+ };
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
{
- kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, Index_);
+ kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, Index_);
enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
}
@@ -370,7 +394,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
typedef typename Kernel::Scalar Scalar;
typedef typename Kernel::PacketType PacketType;
enum {
- requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment,
+ requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
packetSize = unpacket_traits<PacketType>::size,
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
@@ -413,6 +437,9 @@ template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
{
typedef typename Kernel::PacketType PacketType;
+ enum {
+ DefaultAlignment = unpacket_traits<PacketType>::alignment
+ };
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
{
const Index innerSize = kernel.innerSize();
@@ -420,7 +447,7 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
const Index packetSize = unpacket_traits<PacketType>::size;
for(Index outer = 0; outer < outerSize; ++outer)
for(Index inner = 0; inner < innerSize; inner+=packetSize)
- kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, inner);
+ kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, inner);
}
};
@@ -484,7 +511,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
typedef typename Kernel::PacketType PacketType;
enum {
packetSize = unpacket_traits<PacketType>::size,
- requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment),
+ requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
dstAlignment = alignable ? int(requestedAlignment)
@@ -788,8 +815,8 @@ template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, con
template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -806,8 +833,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
src.evalTo(dst);
diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h
new file mode 100644
index 000000000..68c5e918e
--- /dev/null
+++ b/Eigen/src/Core/ConditionEstimator.h
@@ -0,0 +1,166 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONDITIONESTIMATOR_H
+#define EIGEN_CONDITIONESTIMATOR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Vector, typename RealVector, bool IsComplex>
+struct rcond_compute_sign {
+ static inline Vector run(const Vector& v) {
+ const RealVector v_abs = v.cwiseAbs();
+ return (v_abs.array() == static_cast<typename Vector::RealScalar>(0))
+ .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs));
+ }
+};
+
+// Partial specialization to avoid elementwise division for real vectors.
+template <typename Vector>
+struct rcond_compute_sign<Vector, Vector, false> {
+ static inline Vector run(const Vector& v) {
+ return (v.array() < static_cast<typename Vector::RealScalar>(0))
+ .select(-Vector::Ones(v.size()), Vector::Ones(v.size()));
+ }
+};
+
+/** \brief Reciprocal condition number estimator.
+ *
+ * Computing a decomposition of a dense matrix takes O(n^3) operations, while
+ * this method estimates the condition number quickly and reliably in O(n^2)
+ * operations.
+ *
+ * \returns an estimate of the reciprocal condition number
+ * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
+ * its decomposition. Supports the following decompositions: FullPivLU,
+ * PartialPivLU, LDLT, and LLT.
+ *
+ * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+ */
+template <typename Decomposition>
+typename Decomposition::RealScalar
+rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec)
+{
+ typedef typename Decomposition::RealScalar RealScalar;
+ eigen_assert(dec.rows() == dec.cols());
+ if (dec.rows() == 0) return RealScalar(1);
+ if (matrix_norm == RealScalar(0)) return RealScalar(0);
+ if (dec.rows() == 1) return RealScalar(1);
+ const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
+ return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0)
+ : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
+}
+
+/**
+ * \returns an estimate of ||inv(matrix)||_1 given a decomposition of
+ * \a matrix that implements .solve() and .adjoint().solve() methods.
+ *
+ * This function implements Algorithms 4.1 and 5.1 from
+ * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf
+ * which also forms the basis for the condition number estimators in
+ * LAPACK. Since at most 10 calls to the solve method of dec are
+ * performed, the total cost is O(dims^2), as opposed to O(dims^3)
+ * needed to compute the inverse matrix explicitly.
+ *
+ * The most common usage is in estimating the condition number
+ * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be
+ * computed directly in O(n^2) operations.
+ *
+ * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and
+ * LLT.
+ *
+ * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+ */
+template <typename Decomposition>
+typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec)
+{
+ typedef typename Decomposition::MatrixType MatrixType;
+ typedef typename Decomposition::Scalar Scalar;
+ typedef typename Decomposition::RealScalar RealScalar;
+ typedef typename internal::plain_col_type<MatrixType>::type Vector;
+ typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector;
+ const bool is_complex = (NumTraits<Scalar>::IsComplex != 0);
+
+ eigen_assert(dec.rows() == dec.cols());
+ const Index n = dec.rows();
+ if (n == 0)
+ return 0;
+
+ Vector v = dec.solve(Vector::Ones(n) / Scalar(n));
+
+ // lower_bound is a lower bound on
+ // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1
+ // and is the objective maximized by the ("super-") gradient ascent
+ // algorithm below.
+ RealScalar lower_bound = v.template lpNorm<1>();
+ if (n == 1)
+ return lower_bound;
+
+ // Gradient ascent algorithm follows: We know that the optimum is achieved at
+ // one of the simplices v = e_i, so in each iteration we follow a
+ // super-gradient to move towards the optimal one.
+ RealScalar old_lower_bound = lower_bound;
+ Vector sign_vector(n);
+ Vector old_sign_vector;
+ Index v_max_abs_index = -1;
+ Index old_v_max_abs_index = v_max_abs_index;
+ for (int k = 0; k < 4; ++k)
+ {
+ sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v);
+ if (k > 0 && !is_complex && sign_vector == old_sign_vector) {
+ // Break if the solution stagnated.
+ break;
+ }
+ // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )|
+ v = dec.adjoint().solve(sign_vector);
+ v.real().cwiseAbs().maxCoeff(&v_max_abs_index);
+ if (v_max_abs_index == old_v_max_abs_index) {
+ // Break if the solution stagnated.
+ break;
+ }
+ // Move to the new simplex e_j, where j = v_max_abs_index.
+ v = dec.solve(Vector::Unit(n, v_max_abs_index)); // v = inv(matrix) * e_j.
+ lower_bound = v.template lpNorm<1>();
+ if (lower_bound <= old_lower_bound) {
+ // Break if the gradient step did not increase the lower_bound.
+ break;
+ }
+ if (!is_complex) {
+ old_sign_vector = sign_vector;
+ }
+ old_v_max_abs_index = v_max_abs_index;
+ old_lower_bound = lower_bound;
+ }
+ // The following calculates an independent estimate of ||matrix||_1 by
+ // multiplying matrix by a vector with entries of slowly increasing
+ // magnitude and alternating sign:
+ // v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1.
+ // This improvement to Hager's algorithm above is due to Higham. It was
+ // added to make the algorithm more robust in certain corner cases where
+ // large elements in the matrix might otherwise escape detection due to
+ // exact cancellation (especially when op and op_adjoint correspond to a
+ // sequence of backsubstitutions and permutations), which could cause
+ // Hager's algorithm to vastly underestimate ||matrix||_1.
+ Scalar alternating_sign(RealScalar(1));
+ for (Index i = 0; i < n; ++i) {
+ v[i] = alternating_sign * (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+ alternating_sign = -alternating_sign;
+ }
+ v = dec.solve(v);
+ const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n));
+ return numext::maxi(lower_bound, alternate_lower_bound);
+}
+
+} // namespace internal
+
+} // namespace Eigen
+
+#endif
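The estimator itself lives in internal::, so user code reaches it through the rcond() members this patch adds to the supported decompositions. The sketch below (assuming this revision, where the helper's documentation lists PartialPivLU among them) compares the O(n^2) estimate against the exact 1-norm reciprocal condition number computed from the explicit inverse, which costs O(n^3):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      const int n = 50;
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
      Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);

      // Cheap estimate from the decomposition (at most ~10 solves).
      double estimated = lu.rcond();

      // Exact value for comparison: ||A||_1 is the max absolute column sum.
      double a_norm = A.cwiseAbs().colwise().sum().maxCoeff();
      double inv_norm = A.inverse().cwiseAbs().colwise().sum().maxCoeff();
      std::cout << estimated << " vs " << 1.0 / (a_norm * inv_norm) << "\n";
    }
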
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 388805f0d..932178f53 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -850,14 +850,14 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
template<int StoreMode, typename PacketType>
EIGEN_STRONG_INLINE
void writePacket(Index row, Index col, const PacketType& x)
- {
+ {
return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
}
template<int StoreMode, typename PacketType>
EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
- {
+ {
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0,
x);
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 53f934999..f7c5f4276 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -81,6 +81,8 @@ public:
* This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
// FIXME I'm not sure the current mapping is the ideal one.
template<int M, int N> struct product_type_selector<M,N,1> { enum { ret = OuterProduct }; };
+template<int M> struct product_type_selector<M, 1, 1> { enum { ret = LazyCoeffBasedProductMode }; };
+template<int N> struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; };
template<int Depth> struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; };
template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; };
template<> struct product_type_selector<Small,1, Small> { enum { ret = CoeffBasedProductMode }; };
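The two new specializations route products whose result is a vector and whose inner dimension is 1 (an M-by-1 or 1-by-N times a 1-by-1 operand) to the lazy coefficient-based path rather than the outer-product kernel. A small sketch of an expression that hits this selector:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Vector4d v(1, 2, 3, 4);
      Eigen::Matrix<double, 1, 1> s;
      s << 2.0;
      // Rows=4, Cols=1, Depth=1: now evaluated coefficient-wise and lazily.
      Eigen::Vector4d w = v * s;
      std::cout << w.transpose() << "\n";
    }
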
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 6ff61c18a..001c2ffbf 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -62,7 +62,7 @@ struct default_packet_traits
HasRsqrt = 0,
HasExp = 0,
HasLog = 0,
- HasLog10 = 0,
+ HasLog10 = 0,
HasPow = 0,
HasSin = 0,
@@ -71,9 +71,9 @@ struct default_packet_traits
HasASin = 0,
HasACos = 0,
HasATan = 0,
- HasSinh = 0,
- HasCosh = 0,
- HasTanh = 0,
+ HasSinh = 0,
+ HasCosh = 0,
+ HasTanh = 0,
HasLGamma = 0,
HasDiGamma = 0,
HasZeta = 0,
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index fd73f543b..f31046b54 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -705,12 +705,12 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>:
isfinite_impl(const T& x)
{
#ifdef __CUDA_ARCH__
- return (isfinite)(x);
+ return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
return isfinite EIGEN_NOT_A_MACRO (x);
#else
- return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+ return x<=NumTraits<T>::highest() && x>=NumTraits<T>::lowest();
#endif
}
@@ -720,7 +720,7 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>:
isinf_impl(const T& x)
{
#ifdef __CUDA_ARCH__
- return (isinf)(x);
+ return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
return isinf EIGEN_NOT_A_MACRO (x);
@@ -735,7 +735,7 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>:
isnan_impl(const T& x)
{
#ifdef __CUDA_ARCH__
- return (isnan)(x);
+ return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
return isnan EIGEN_NOT_A_MACRO (x);
@@ -1025,6 +1025,66 @@ double log(const double &x) { return ::log(x); }
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename NumTraits<T>::Real abs(const T &x) {
+ EIGEN_USING_STD_MATH(abs);
+ return abs(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float abs(const float &x) { return ::fabsf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double abs(const double &x) { return ::fabs(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T exp(const T &x) {
+ EIGEN_USING_STD_MATH(exp);
+ return exp(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float exp(const float &x) { return ::expf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double exp(const double &x) { return ::exp(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cos(const T &x) {
+ EIGEN_USING_STD_MATH(cos);
+ return cos(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float cos(const float &x) { return ::cosf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double cos(const double &x) { return ::cos(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sin(const T &x) {
+ EIGEN_USING_STD_MATH(sin);
+ return sin(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sin(const float &x) { return ::sinf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sin(const double &x) { return ::sin(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T tan(const T &x) {
EIGEN_USING_STD_MATH(tan);
return tan(x);
@@ -1040,39 +1100,99 @@ double tan(const double &x) { return ::tan(x); }
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-typename NumTraits<T>::Real abs(const T &x) {
- EIGEN_USING_STD_MATH(abs);
- return abs(x);
+T acos(const T &x) {
+ EIGEN_USING_STD_MATH(acos);
+ return acos(x);
}
#ifdef __CUDACC__
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-float abs(const float &x) { return ::fabsf(x); }
+float acos(const float &x) { return ::acosf(x); }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-double abs(const double &x) { return ::fabs(x); }
+double acos(const double &x) { return ::acos(x); }
#endif
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T exp(const T &x) {
- EIGEN_USING_STD_MATH(exp);
- return exp(x);
+T asin(const T &x) {
+ EIGEN_USING_STD_MATH(asin);
+ return asin(x);
}
#ifdef __CUDACC__
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-float exp(const float &x) { return ::expf(x); }
+float asin(const float &x) { return ::asinf(x); }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-double exp(const double &x) { return ::exp(x); }
+double asin(const double &x) { return ::asin(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atan(const T &x) {
+ EIGEN_USING_STD_MATH(atan);
+ return atan(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float atan(const float &x) { return ::atanf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double atan(const double &x) { return ::atan(x); }
#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cosh(const T &x) {
+ EIGEN_USING_STD_MATH(cosh);
+ return cosh(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float cosh(const float &x) { return ::coshf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double cosh(const double &x) { return ::cosh(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sinh(const T &x) {
+ EIGEN_USING_STD_MATH(sinh);
+ return sinh(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sinh(const float &x) { return ::sinhf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sinh(const double &x) { return ::sinh(x); }
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T tanh(const T &x) {
+ EIGEN_USING_STD_MATH(tanh);
+ return tanh(x);
+}
+
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(const float &x) { return ::tanhf(x); }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double tanh(const double &x) { return ::tanh(x); }
+#endif
+
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T fmod(const T& a, const T& b) {
- EIGEN_USING_STD_MATH(floor);
+ EIGEN_USING_STD_MATH(fmod);
return fmod(a, b);
}
@@ -1128,14 +1248,12 @@ struct scalar_fuzzy_default_impl<Scalar, false, false>
template<typename OtherScalar> EIGEN_DEVICE_FUNC
static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
{
- EIGEN_USING_STD_MATH(abs);
- return abs(x) <= abs(y) * prec;
+ return numext::abs(x) <= numext::abs(y) * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
{
- EIGEN_USING_STD_MATH(abs);
- return abs(x - y) <= numext::mini(abs(x), abs(y)) * prec;
+ return numext::abs(x - y) <= numext::mini(numext::abs(x), numext::abs(y)) * prec;
}
EIGEN_DEVICE_FUNC
static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
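The point of the numext:: wrappers added above is that the same call compiles on host and device: the generic template defers to std:: math on the host, while the __CUDACC__ specializations dispatch to the CUDA builtins. A minimal host-side sketch (soft_clip is a hypothetical helper, not part of the patch):

    #include <Eigen/Core>
    #include <iostream>

    // EIGEN_DEVICE_FUNC expands to __host__ __device__ under NVCC, so this
    // helper is callable from both host code and CUDA kernels.
    template <typename T>
    EIGEN_DEVICE_FUNC T soft_clip(const T& x) {
      return Eigen::numext::tanh(x);  // ::tanhf / ::tanh on device
    }

    int main() {
      std::cout << soft_clip(2.0f) << " " << soft_clip(2.0) << "\n";
    }
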
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 3ce86e8cd..d9fd888cf 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -410,8 +410,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
typedef Product<Lhs, Rhs, LazyProduct> XprType;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename XprType::PacketScalar PacketScalar;
- typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
explicit product_evaluator(const XprType& xpr)
@@ -437,16 +435,20 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
typedef evaluator<LhsNestedCleaned> LhsEtorType;
typedef evaluator<RhsNestedCleaned> RhsEtorType;
-
+
enum {
RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
- MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime,
-
- PacketSize = packet_traits<Scalar>::size,
+ MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
+ };
+
+ typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType;
+ typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType;
+ enum {
+
LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
@@ -459,19 +461,23 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
LhsFlags = LhsEtorType::Flags,
RhsFlags = RhsEtorType::Flags,
- LhsAlignment = LhsEtorType::Alignment,
- RhsAlignment = RhsEtorType::Alignment,
-
LhsRowMajor = LhsFlags & RowMajorBit,
RhsRowMajor = RhsFlags & RowMajorBit,
+
+ LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size,
+ RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
+
+ // Here, we don't care about alignment larger than the usable packet size.
+ LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))),
+ RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))),
SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
- && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % PacketSize) == 0) ),
+ && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % RhsVecPacketSize) == 0) ),
CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
- && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % PacketSize) == 0) ),
+ && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % LhsVecPacketSize) == 0) ),
EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
: (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
@@ -491,10 +497,10 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
: 0,
/* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
- * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
- * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
- * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
- */
+ * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+ * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+ * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+ */
CanVectorizeInner = SameType
&& LhsRowMajor
&& (!RhsRowMajor)
@@ -1000,7 +1006,7 @@ struct transposition_matrix_product
const Index size = tr.size();
StorageIndex j = 0;
- if(!(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat)))
+ if(!is_same_dense(dst,mat))
dst = mat;
for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index d170cae29..98b2fd868 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -27,8 +27,9 @@ template<typename Func, typename Derived>
struct redux_traits
{
public:
+ typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
enum {
- PacketSize = packet_traits<typename Derived::Scalar>::size,
+ PacketSize = unpacket_traits<PacketType>::size,
InnerMaxSize = int(Derived::IsRowMajor)
? Derived::MaxColsAtCompileTime
: Derived::MaxRowsAtCompileTime
@@ -137,12 +138,12 @@ template<typename Func, typename Derived, int Start, int Length>
struct redux_vec_unroller
{
enum {
- PacketSize = packet_traits<typename Derived::Scalar>::size,
+ PacketSize = redux_traits<Func, Derived>::PacketSize,
HalfLength = Length/2
};
typedef typename Derived::Scalar Scalar;
- typedef typename packet_traits<Scalar>::type PacketScalar;
+ typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
{
@@ -156,14 +157,14 @@ template<typename Func, typename Derived, int Start>
struct redux_vec_unroller<Func, Derived, Start, 1>
{
enum {
- index = Start * packet_traits<typename Derived::Scalar>::size,
+ index = Start * redux_traits<Func, Derived>::PacketSize,
outer = index / int(Derived::InnerSizeAtCompileTime),
inner = index % int(Derived::InnerSizeAtCompileTime),
alignment = Derived::Alignment
};
typedef typename Derived::Scalar Scalar;
- typedef typename packet_traits<Scalar>::type PacketScalar;
+ typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
{
@@ -209,13 +210,13 @@ template<typename Func, typename Derived>
struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
{
typedef typename Derived::Scalar Scalar;
- typedef typename packet_traits<Scalar>::type PacketScalar;
+ typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
static Scalar run(const Derived &mat, const Func& func)
{
const Index size = mat.size();
- const Index packetSize = packet_traits<Scalar>::size;
+ const Index packetSize = redux_traits<Func, Derived>::PacketSize;
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
enum {
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
@@ -268,7 +269,7 @@ template<typename Func, typename Derived, int Unrolling>
struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
{
typedef typename Derived::Scalar Scalar;
- typedef typename packet_traits<Scalar>::type PacketType;
+ typedef typename redux_traits<Func, Derived>::PacketType PacketType;
EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
{
@@ -276,7 +277,7 @@ struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
const Index innerSize = mat.innerSize();
const Index outerSize = mat.outerSize();
enum {
- packetSize = packet_traits<Scalar>::size
+ packetSize = redux_traits<Func, Derived>::PacketSize
};
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
Scalar res;
@@ -306,9 +307,10 @@ template<typename Func, typename Derived>
struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
{
typedef typename Derived::Scalar Scalar;
- typedef typename packet_traits<Scalar>::type PacketScalar;
+
+ typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
enum {
- PacketSize = packet_traits<Scalar>::size,
+ PacketSize = redux_traits<Func, Derived>::PacketSize,
Size = Derived::SizeAtCompileTime,
VectorizedSize = (Size / PacketSize) * PacketSize
};
@@ -367,11 +369,11 @@ public:
{ return m_evaluator.coeff(index); }
template<int LoadMode, typename PacketType>
- PacketReturnType packet(Index row, Index col) const
+ PacketType packet(Index row, Index col) const
{ return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
template<int LoadMode, typename PacketType>
- PacketReturnType packet(Index index) const
+ PacketType packet(Index index) const
{ return m_evaluator.template packet<LoadMode,PacketType>(index); }
EIGEN_DEVICE_FUNC
@@ -379,7 +381,7 @@ public:
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
template<int LoadMode, typename PacketType>
- PacketReturnType packetByOuterInner(Index outer, Index inner) const
+ PacketType packetByOuterInner(Index outer, Index inner) const
{ return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
const XprType & nestedExpression() const { return m_xpr; }
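
With redux_traits now deriving PacketSize from find_best_packet, reductions over small fixed-size expressions can vectorize with a packet narrower than the full register width. The user-facing API is unchanged; a quick sketch:

    // Reductions pick their packet per expression size; a Vector2f sum no longer
    // needs a full 4-float packet to take a vectorized traversal.
    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::Vector2f v(1.0f, 2.0f);
      std::cout << v.sum() << "\n";   // 3
      Eigen::Matrix<float, 6, 1> w = Eigen::Matrix<float, 6, 1>::Ones();
      std::cout << w.prod() << "\n";  // 1
    }
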
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 5a2010449..a33356423 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -213,7 +213,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
template<typename Dest> inline void evalTo(Dest& dst) const
{
- if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs)))
+ if(!is_same_dense(dst,m_rhs))
dst = m_rhs;
m_triangularMatrix.template solveInPlace<Side>(dst);
}
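
The hand-rolled is_same<...> plus extract_data comparison gives way to internal::is_same_dense here, just as for the transposition product above and the triangular solver below; the helper answers the one question that matters: do dst and rhs share storage? From the user side, the guarded copy looks like this (a sketch):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix2d L;
      L << 2, 0,
           1, 3;
      Eigen::Vector2d b(2, 4);
      Eigen::Vector2d x = L.triangularView<Eigen::Lower>().solve(b); // dst != rhs: copy, then solve in place
      b = L.triangularView<Eigen::Lower>().solve(b);                 // dst aliases rhs: the copy is skipped
      std::cout << x.transpose() << "\n";  // 1 1
    }
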
diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h
index 2a0a6ff15..c6a50bb1d 100644
--- a/Eigen/src/Core/SpecialFunctions.h
+++ b/Eigen/src/Core/SpecialFunctions.h
@@ -79,8 +79,8 @@ namespace cephes {
*/
template <typename Scalar, int N>
struct polevl {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static Scalar run(const Scalar x, const Scalar coef[]) {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
@@ -89,8 +89,8 @@ struct polevl {
template <typename Scalar>
struct polevl<Scalar, 0> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static Scalar run(const Scalar, const Scalar coef[]) {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
return coef[0];
}
};
@@ -144,7 +144,7 @@ struct digamma_retval {
template <typename Scalar>
struct digamma_impl {
EIGEN_DEVICE_FUNC
- static Scalar run(Scalar x) {
+ static EIGEN_STRONG_INLINE Scalar run(Scalar x) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
@@ -281,20 +281,18 @@ struct digamma_impl {
*/
Scalar p, q, nz, s, w, y;
- bool negative;
+ bool negative = false;
const Scalar maxnum = NumTraits<Scalar>::infinity();
- const Scalar m_pi = EIGEN_PI;
-
- negative = 0;
- nz = 0.0;
+ const Scalar m_pi = Scalar(EIGEN_PI);
- const Scalar zero = 0.0;
- const Scalar one = 1.0;
- const Scalar half = 0.5;
+ const Scalar zero = Scalar(0);
+ const Scalar one = Scalar(1);
+ const Scalar half = Scalar(0.5);
+ nz = zero;
if (x <= zero) {
- negative = one;
+ negative = true;
q = x;
p = numext::floor(q);
if (p == q) {
@@ -428,33 +426,33 @@ template <typename Scalar> struct igamma_impl; // predeclare igamma_impl
template <typename Scalar>
struct igamma_helper {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
};
template <>
struct igamma_helper<float> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static float machep() {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float machep() {
return NumTraits<float>::epsilon() / 2; // 1.0 - machep == 1.0
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static float big() {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float big() {
// use epsneg (1.0 - epsneg == 1.0)
- return 1.0 / (NumTraits<float>::epsilon() / 2);
+ return 1.0f / (NumTraits<float>::epsilon() / 2);
}
};
template <>
struct igamma_helper<double> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static double machep() {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double machep() {
return NumTraits<double>::epsilon() / 2; // 1.0 - machep == 1.0
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static double big() {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double big() {
return 1.0 / NumTraits<double>::epsilon();
}
};
@@ -463,7 +461,7 @@ template <typename Scalar>
struct igammac_impl {
EIGEN_DEVICE_FUNC
static Scalar run(Scalar a, Scalar x) {
- /* igamc()
+ /* igamc()
*
* Incomplete gamma integral (modified for Eigen)
*
@@ -519,26 +517,51 @@ struct igammac_impl {
*/
const Scalar zero = 0;
const Scalar one = 1;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if ((x < zero) || (a <= zero)) {
+ // domain error
+ return nan;
+ }
+
+ if ((x < one) || (x < a)) {
+ /* The checks above ensure that we meet the preconditions for
+ * igamma_impl::Impl(), so call it, rather than igamma_impl::run().
+ * Calling run() would also work, but in that case the compiler may not be
+ * able to prove that igammac_impl::run and igamma_impl::run are not
+ * mutually recursive. This leads to worse code, particularly on
+ * platforms like nvptx, where recursion is allowed only begrudgingly.
+ */
+ return (one - igamma_impl<Scalar>::Impl(a, x));
+ }
+
+ return Impl(a, x);
+ }
+
+ private:
+ /* igamma_impl calls igammac_impl::Impl. */
+ friend struct igamma_impl<Scalar>;
+
+ /* Actually computes igamc(a, x).
+ *
+ * Preconditions:
+ * a > 0
+ * x >= 1
+ * x >= a
+ */
+ EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
const Scalar two = 2;
const Scalar machep = igamma_helper<Scalar>::machep();
const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
const Scalar big = igamma_helper<Scalar>::big();
const Scalar biginv = 1 / big;
- const Scalar nan = NumTraits<Scalar>::quiet_NaN();
const Scalar inf = NumTraits<Scalar>::infinity();
Scalar ans, ax, c, yc, r, t, y, z;
Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
- if ((x < zero) || ( a <= zero)) {
- // domain error
- return nan;
- }
-
- if ((x < one) || (x < a)) {
- return (one - igamma_impl<Scalar>::run(a, x));
- }
-
if (x == inf) return zero; // std::isinf crashes on CUDA
/* Compute x**a * exp(-x) / gamma(a) */
@@ -605,7 +628,7 @@ struct igamma_retval {
template <typename Scalar>
struct igamma_impl {
EIGEN_DEVICE_FUNC
- static Scalar run(Scalar a, Scalar x) {
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
@@ -618,7 +641,7 @@ template <typename Scalar>
struct igamma_impl {
EIGEN_DEVICE_FUNC
static Scalar run(Scalar a, Scalar x) {
- /* igam()
+ /* igam()
* Incomplete gamma integral
*
*
@@ -680,22 +703,47 @@ struct igamma_impl {
*/
const Scalar zero = 0;
const Scalar one = 1;
- const Scalar machep = igamma_helper<Scalar>::machep();
- const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
const Scalar nan = NumTraits<Scalar>::quiet_NaN();
- double ans, ax, c, r;
-
if (x == zero) return zero;
- if ((x < zero) || ( a <= zero)) { // domain error
+ if ((x < zero) || (a <= zero)) { // domain error
return nan;
}
if ((x > one) && (x > a)) {
- return (one - igammac_impl<Scalar>::run(a, x));
+ /* The checks above ensure that we meet the preconditions for
+ * igammac_impl::Impl(), so call it, rather than igammac_impl::run().
+ * Calling run() would also work, but in that case the compiler may not be
+ * able to prove that igammac_impl::run and igamma_impl::run are not
+ * mutually recursive. This leads to worse code, particularly on
+ * platforms like nvptx, where recursion is allowed only begrudgingly.
+ */
+ return (one - igammac_impl<Scalar>::Impl(a, x));
}
+ return Impl(a, x);
+ }
+
+ private:
+ /* igammac_impl calls igamma_impl::Impl. */
+ friend struct igammac_impl<Scalar>;
+
+ /* Actually computes igam(a, x).
+ *
+ * Preconditions:
+ * x > 0
+ * a > 0
+ * !(x > 1 && x > a)
+ */
+ EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar machep = igamma_helper<Scalar>::machep();
+ const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+
+ Scalar ans, ax, c, r;
+
/* Compute x**a * exp(-x) / gamma(a) */
ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
if (ax < -maxlog) {
@@ -736,7 +784,7 @@ struct zeta_retval {
template <typename Scalar>
struct zeta_impl {
EIGEN_DEVICE_FUNC
- static Scalar run(Scalar x, Scalar q) {
+ static EIGEN_STRONG_INLINE Scalar run(Scalar x, Scalar q) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
@@ -757,8 +805,8 @@ struct zeta_impl_series {
template <>
struct zeta_impl_series<float> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static bool run(float& a, float& b, float& s, const float x, const float machep) {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) {
int i = 0;
while(i < 9)
{
@@ -777,8 +825,8 @@ struct zeta_impl_series<float> {
template <>
struct zeta_impl_series<double> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static bool run(double& a, double& b, double& s, const double x, const double machep) {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) {
int i = 0;
while( (i < 9) || (a <= 9.0) )
{
@@ -881,13 +929,14 @@ struct zeta_impl {
const Scalar maxnum = NumTraits<Scalar>::infinity();
const Scalar zero = 0.0, half = 0.5, one = 1.0;
const Scalar machep = igamma_helper<Scalar>::machep();
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
if( x == one )
return maxnum;
if( x < one )
{
- return zero;
+ return nan;
}
if( q <= zero )
@@ -899,7 +948,7 @@ struct zeta_impl {
p = x;
r = numext::floor(p);
if (p != r)
- return zero;
+ return nan;
}
/* Permit negative q but continue sum until n+q > +9 .
@@ -954,7 +1003,7 @@ struct polygamma_retval {
template <typename Scalar>
struct polygamma_impl {
EIGEN_DEVICE_FUNC
- static Scalar run(Scalar n, Scalar x) {
+ static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) {
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
THIS_TYPE_IS_NOT_SUPPORTED);
return Scalar(0);
@@ -969,9 +1018,14 @@ struct polygamma_impl {
static Scalar run(Scalar n, Scalar x) {
Scalar zero = 0.0, one = 1.0;
Scalar nplus = n + one;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+ // Check that n is an integer
+ if (numext::floor(n) != n) {
+ return nan;
+ }
// Just return the digamma function for n = 1
- if (n == zero) {
+ else if (n == zero) {
return digamma_impl<Scalar>::run(x);
}
// Use the same implementation as scipy
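
The run()/Impl() split above lets each public entry point delegate to the other's worker without the call graph looking mutually recursive to the compiler, while the usual complementary identity is preserved. A quick check, assuming a build where the C99-math-gated numext wrappers are available:

    // igamma(a, x) + igammac(a, x) == 1 for a > 0, x >= 0 (up to rounding).
    #include <Eigen/Core>
    #include <iostream>

    int main() {
      const double a = 2.5, x = 1.25;
      const double p = Eigen::numext::igamma(a, x);
      const double q = Eigen::numext::igammac(a, x);
      std::cout << p + q << "\n";  // ~1.0
    }
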
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index 7fe39808b..d2fe1e199 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -168,11 +168,12 @@ MatrixBase<Derived>::stableNorm() const
DerivedCopy copy(derived());
enum {
- CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME
+ CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
+ || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+ ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // if we cannot allocate on the stack, then let's not bother with this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
- typename DerivedCopyClean
- ::ConstSegmentReturnType>::type SegmentWrapper;
+ typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
Index n = size();
if(n==1)
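
The reworked CanAlign also refuses the aligned-Ref path when the blockwise buffer would not fit under EIGEN_STACK_ALLOCATION_LIMIT; callers see no API change. As a reminder of what stableNorm buys:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      // Blockwise rescaling avoids the overflow a naive sqrt(sum of squares) hits.
      Eigen::VectorXd v = Eigen::VectorXd::Constant(4, 1e200);
      std::cout << v.norm()       << "\n";  // inf: the squares overflow
      std::cout << v.stableNorm() << "\n";  // 2e+200
    }
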
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index e6d137e40..5c5e5028e 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -532,7 +532,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
- if(!(internal::is_same<RhsType,DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs)))
+ if(!internal::is_same_dense(dst,rhs))
dst = rhs;
this->solveInPlace(dst);
}
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 0a3b301bf..a63b20318 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -46,6 +46,8 @@
// Make our own __half definition that is similar to CUDA's.
struct __half {
+ __half() {}
+ explicit __half(unsigned short raw) : x(raw) {}
unsigned short x;
};
@@ -70,12 +72,18 @@ struct half : public __half {
explicit EIGEN_DEVICE_FUNC half(bool b)
: __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+ explicit EIGEN_DEVICE_FUNC half(unsigned int ui)
+ : __half(internal::float_to_half_rtne(static_cast<float>(ui))) {}
explicit EIGEN_DEVICE_FUNC half(int i)
: __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
+ explicit EIGEN_DEVICE_FUNC half(unsigned long ul)
+ : __half(internal::float_to_half_rtne(static_cast<float>(ul))) {}
explicit EIGEN_DEVICE_FUNC half(long l)
: __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
explicit EIGEN_DEVICE_FUNC half(long long ll)
: __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
+ explicit EIGEN_DEVICE_FUNC half(unsigned long long ull)
+ : __half(internal::float_to_half_rtne(static_cast<float>(ull))) {}
explicit EIGEN_DEVICE_FUNC half(float f)
: __half(internal::float_to_half_rtne(f)) {}
explicit EIGEN_DEVICE_FUNC half(double d)
@@ -286,7 +294,8 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff)
const FP32 f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
- __half o = { 0 };
+ __half o;
+ o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
f.u ^= sign;
@@ -366,13 +375,22 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half>
{
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float dummy_precision() { return 1e-3f; }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+ return internal::raw_uint16_to_half(0x0800);
+ }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-2f); }
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
return internal::raw_uint16_to_half(0x7bff);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
return internal::raw_uint16_to_half(0xfbff);
}
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+ return internal::raw_uint16_to_half(0x7c00);
+ }
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+ return internal::raw_uint16_to_half(0x7c01);
+ }
};
// Infinity/NaN checks.
@@ -392,6 +410,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a)
static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) {
return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
}
+
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) {
Eigen::half result;
result.x = a.x & 0x7FFF;
@@ -406,6 +425,21 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::ha
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) {
return Eigen::half(::sqrtf(float(a)));
}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) {
+ return Eigen::half(::powf(float(a), float(b)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sin(const Eigen::half& a) {
+ return Eigen::half(::sinf(float(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half cos(const Eigen::half& a) {
+ return Eigen::half(::cosf(float(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tan(const Eigen::half& a) {
+ return Eigen::half(::tanf(float(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tanh(const Eigen::half& a) {
+ return Eigen::half(::tanhf(float(a)));
+}
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) {
return Eigen::half(::floorf(float(a)));
}
@@ -413,6 +447,51 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::h
return Eigen::half(::ceilf(float(a)));
}
+template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::half& a, const Eigen::half& b) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hlt(b, a) ? b : a;
+#else
+ const float f1 = static_cast<float>(a);
+ const float f2 = static_cast<float>(b);
+ return f2 < f1 ? b : a;
+#endif
+}
+template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::half& a, const Eigen::half& b) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hlt(a, b) ? b : a;
+#else
+ const float f1 = static_cast<float>(a);
+ const float f2 = static_cast<float>(b);
+ return f1 < f2 ? b : a;
+#endif
+}
+
+#ifdef EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+ return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+#endif
} // end namespace numext
} // end namespace Eigen
@@ -432,6 +511,9 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half&
static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
return Eigen::half(::sqrtf(float(a)));
}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
+ return Eigen::half(::powf(float(a), float(b)));
+}
static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
return Eigen::half(::floorf(float(a)));
}
@@ -451,6 +533,11 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a
namespace std {
+EIGEN_ALWAYS_INLINE ostream& operator << (ostream& os, const Eigen::half& v) {
+ os << static_cast<float>(v);
+ return os;
+}
+
#if __cplusplus > 199711L
template <>
struct hash<Eigen::half> {
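
With epsilon/infinity/quiet_NaN in NumTraits, the extra math overloads, and the ostream operator added above, Eigen::half can flow through generic scalar code on the host as well. A small sketch:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      using Eigen::half;
      half a(1.5f), b(0.5f);
      std::cout << a + b << "\n";                              // 2, printed via the new operator<<
      std::cout << Eigen::numext::mini(a, b) << "\n";          // 0.5
      std::cout << Eigen::NumTraits<half>::epsilon() << "\n";  // raw bits 0x0800, as defined above
    }
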
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 14f0c9415..0cebc1017 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -17,6 +17,7 @@
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+// Most of the following operations require arch >= 3.0
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
namespace Eigen {
@@ -33,14 +34,7 @@ template<> struct packet_traits<half> : default_packet_traits
AlignedOnScalar = 1,
size=2,
HasHalfPacket = 0,
-
- HasDiv = 1,
- HasLog = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
-
- HasBlend = 0,
+ HasDiv = 1
};
};
@@ -73,9 +67,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, co
}
template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
#if __CUDA_ARCH__ >= 320
- return __ldg((const half2*)from);
+ return __ldg((const half2*)from);
#else
return __halves2half2(*(from+0), *(from+1));
#endif
@@ -84,7 +78,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
#if __CUDA_ARCH__ >= 320
- return __halves2half2(__ldg(from+0), __ldg(from+1));
+ return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
return __halves2half2(*(from+0), *(from+1));
#endif
@@ -120,33 +114,84 @@ ptranspose(PacketBlock<half2,2>& kernel) {
kernel.packet[1] = __halves2half2(a2, b2);
}
-// The following operations require arch >= 5.3
-#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
+#if __CUDA_ARCH__ >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+ float f = __half2float(a) + 1.0f;
+ return __halves2half2(a, __float2half(f));
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
return __hadd2(a, b);
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ float r1 = a1 + b1;
+ float r2 = a2 + b2;
+ return __floats2half2_rn(r1, r2);
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
return __hsub2(a, b);
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ float r1 = a1 - b1;
+ float r2 = a2 - b2;
+ return __floats2half2_rn(r1, r2);
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if __CUDA_ARCH__ >= 530
return __hneg2(a);
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return __floats2half2_rn(-a1, -a2);
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
return __hmul2(a, b);
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ float r1 = a1 * b1;
+ float r2 = a2 * b2;
+ return __floats2half2_rn(r1, r2);
+#endif
}
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if __CUDA_ARCH__ >= 530
return __hfma2(a, b, c);
- }
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ float b1 = __low2float(b);
+ float b2 = __high2float(b);
+ float c1 = __low2float(c);
+ float c2 = __high2float(c);
+ float r1 = a1 * b1 + c1;
+ float r2 = a2 * b2 + c2;
+ return __floats2half2_rn(r1, r2);
+#endif
+}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
@@ -179,25 +224,48 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2&
}
template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
return __hadd(__low2half(a), __high2half(a));
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return half(__float2half_rn(a1 + a2));
+#endif
}
template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
half first = __low2half(a);
half second = __high2half(a);
return __hgt(first, second) ? first : second;
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return half(__float2half_rn(numext::maxi(a1, a2)));
+#endif
}
template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
half first = __low2half(a);
half second = __high2half(a);
return __hlt(first, second) ? first : second;
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return half(__float2half_rn(numext::mini(a1, a2)));
+#endif
}
template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
return __hmul(__low2half(a), __high2half(a));
-}
+#else
+ float a1 = __low2float(a);
+ float a2 = __high2float(a);
+ return half(__float2half_rn(a1 * a2));
#endif
+}
} // end namespace internal
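
Each half2 primitive above now compiles for any sm_30+ target: native fp16 intrinsics on sm_53+, otherwise widen to float per lane, operate, and repack. The pattern, rendered as a plain C++ sketch (the struct is a hypothetical stand-in for CUDA's half2):

    #include <cstdio>

    struct half2_sim { float lo, hi; };  // stand-in for CUDA's 2-lane half2

    // Mirrors the sm_30 pmadd fallback: each lane fuses with its own addend.
    half2_sim pmadd_sim(half2_sim a, half2_sim b, half2_sim c) {
      return { a.lo * b.lo + c.lo,
               a.hi * b.hi + c.hi };
    }

    int main() {
      half2_sim r = pmadd_sim({1, 2}, {3, 4}, {5, 6});
      std::printf("%g %g\n", r.lo, r.hi);  // 8 14
    }
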
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
index b2a9724de..3ea218133 100644
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -71,6 +71,7 @@ struct functor_traits<scalar_cast_op<half, float> >
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
template <>
struct type_casting_traits<half, float> {
@@ -82,22 +83,9 @@ struct type_casting_traits<half, float> {
};
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
float2 r1 = __half22float2(a);
float2 r2 = __half22float2(b);
return make_float4(r1.x, r1.y, r2.x, r2.y);
-#else
- half r1;
- r1.x = a.x & 0xFFFF;
- half r2;
- r2.x = (a.x & 0xFFFF0000) >> 16;
- half r3;
- r3.x = b.x & 0xFFFF;
- half r4;
- r4.x = (b.x & 0xFFFF0000) >> 16;
- return make_float4(static_cast<float>(r1), static_cast<float>(r2),
- static_cast<float>(r3), static_cast<float>(r4));
-#endif
}
template <>
@@ -111,20 +99,11 @@ struct type_casting_traits<float, half> {
template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
// Simply discard the second half of the input
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
return __float22half2_rn(make_float2(a.x, a.y));
-#else
- half r1 = static_cast<half>(a.x);
- half r2 = static_cast<half>(a.y);
- half2 r;
- r.x = 0;
- r.x |= r1.x;
- r.x |= (static_cast<unsigned int>(r2.x) << 16);
- return r;
-#endif
}
#endif
+#endif
} // end namespace internal
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 63a2d9f52..3224c36bd 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -179,6 +179,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
// then implements a slow software scalar fallback calling fmaf()!
+// Filed LLVM bug:
+// https://llvm.org/bugs/show_bug.cgi?id=27216
#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
// See bug 936.
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
@@ -195,6 +197,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f&
// -march=armv7-a, that is a very common case.
// See e.g. this thread:
// http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
+ // Filed LLVM bug:
+ // https://llvm.org/bugs/show_bug.cgi?id=27219
Packet4f r = c;
asm volatile(
"vmla.f32 %q[r], %q[a], %q[b]"
diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index e28fecfd0..5cd8ca950 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -345,6 +345,22 @@ template<> struct functor_traits<scalar_boolean_or_op> {
};
/** \internal
+ * \brief Template functor to compute the xor of two booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator^
+ */
+struct scalar_boolean_xor_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
+};
+template<> struct functor_traits<scalar_boolean_xor_op> {
+ enum {
+ Cost = NumTraits<bool>::AddCost,
+ PacketAccess = false
+ };
+};
+
+/** \internal
* \brief Template functor to compute the incomplete gamma function igamma(a, x)
*
* \sa class CwiseBinaryOp, Cwise::igamma
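
The xor functor fills the gap next to the existing boolean and/or functors. Assuming the ArrayBase::operator^ referenced in the \sa line is wired up elsewhere in this change, usage looks like:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::Array<bool, 4, 1> a, b;
      a << true, true,  false, false;
      b << true, false, true,  false;
      std::cout << (a ^ b).transpose() << "\n";  // 0 1 1 0
    }
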
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 7ba0abedc..488ebf1d2 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -234,9 +234,33 @@ template<typename Scalar> struct scalar_exp_op {
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
};
-template<typename Scalar>
-struct functor_traits<scalar_exp_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
+template <typename Scalar>
+struct functor_traits<scalar_exp_op<Scalar> > {
+ enum {
+ PacketAccess = packet_traits<Scalar>::HasExp,
+ // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+ // Haswell can issue 2 add/mul/madd per cycle.
+ Cost =
+ (sizeof(Scalar) == 4
+ // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other
+ ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost)
+ // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other
+ : (14 * NumTraits<Scalar>::AddCost +
+ 6 * NumTraits<Scalar>::MulCost +
+ NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
+#else
+ Cost =
+ (sizeof(Scalar) == 4
+ // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other
+ ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost)
+ // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other
+ : (23 * NumTraits<Scalar>::AddCost +
+ 12 * NumTraits<Scalar>::MulCost +
+ NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost))
+#endif
+ };
+};
/** \internal
*
@@ -250,9 +274,24 @@ template<typename Scalar> struct scalar_log_op {
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
};
-template<typename Scalar>
-struct functor_traits<scalar_log_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
+template <typename Scalar>
+struct functor_traits<scalar_log_op<Scalar> > {
+ enum {
+ PacketAccess = packet_traits<Scalar>::HasLog,
+ Cost =
+ (PacketAccess
+ // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+ // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle.
+ ? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost)
+#else
+ // 8 pmadd, 6 pmul, 8 padd/psub, 20 other
+ ? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost)
+#endif
+ // Measured cost of std::log.
+ : sizeof(Scalar)==4 ? 40 : 85)
+ };
+};
/** \internal
*
@@ -280,10 +319,19 @@ template<typename Scalar> struct scalar_sqrt_op {
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
};
-template<typename Scalar>
-struct functor_traits<scalar_sqrt_op<Scalar> >
-{ enum {
- Cost = 5 * NumTraits<Scalar>::MulCost,
+template <typename Scalar>
+struct functor_traits<scalar_sqrt_op<Scalar> > {
+ enum {
+#if EIGEN_FAST_MATH
+ // The following numbers are based on the AVX implementation.
+ Cost = (sizeof(Scalar) == 8 ? 28
+ // 4 pmul, 1 pmadd, 3 other
+ : (3 * NumTraits<Scalar>::AddCost +
+ 5 * NumTraits<Scalar>::MulCost)),
+#else
+ // The following numbers are based on min VSQRT throughput on Haswell.
+ Cost = (sizeof(Scalar) == 8 ? 28 : 14),
+#endif
PacketAccess = packet_traits<Scalar>::HasSqrt
};
};
@@ -313,7 +361,7 @@ struct functor_traits<scalar_rsqrt_op<Scalar> >
*/
template<typename Scalar> struct scalar_cos_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
- EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
+ EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
};
@@ -332,7 +380,7 @@ struct functor_traits<scalar_cos_op<Scalar> >
*/
template<typename Scalar> struct scalar_sin_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
- EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
};
@@ -352,7 +400,7 @@ struct functor_traits<scalar_sin_op<Scalar> >
*/
template<typename Scalar> struct scalar_tan_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
- EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
};
@@ -371,7 +419,7 @@ struct functor_traits<scalar_tan_op<Scalar> >
*/
template<typename Scalar> struct scalar_acos_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
- EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
};
@@ -390,7 +438,7 @@ struct functor_traits<scalar_acos_op<Scalar> >
*/
template<typename Scalar> struct scalar_asin_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
- EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
};
@@ -546,7 +594,7 @@ struct functor_traits<scalar_erfc_op<Scalar> >
*/
template<typename Scalar> struct scalar_atan_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
- EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); }
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
};
@@ -566,7 +614,7 @@ struct functor_traits<scalar_atan_op<Scalar> >
*/
template<typename Scalar> struct scalar_tanh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
- EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tanh; return tanh(a); }
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tanh(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); }
};
@@ -574,8 +622,24 @@ template<typename Scalar>
struct functor_traits<scalar_tanh_op<Scalar> >
{
enum {
- Cost = 5 * NumTraits<Scalar>::MulCost,
- PacketAccess = packet_traits<Scalar>::HasTanh
+ PacketAccess = packet_traits<Scalar>::HasTanh,
+ Cost =
+ (PacketAccess
+ // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+ // Haswell can issue 2 add/mul/madd per cycle.
+ // 9 pmadd, 2 pmul, 1 div, 2 other
+ ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost +
+ NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
+#else
+ ? (11 * NumTraits<Scalar>::AddCost +
+ 11 * NumTraits<Scalar>::MulCost +
+ NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)
+#endif
+ // This number assumes a naive implementation of tanh
+ : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost +
+ 2 * NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost +
+ functor_traits<scalar_exp_op<Scalar> >::Cost))
};
};
@@ -585,7 +649,7 @@ struct functor_traits<scalar_tanh_op<Scalar> >
*/
template<typename Scalar> struct scalar_sinh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op)
- EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sinh; return sinh(a); }
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); }
};
@@ -604,7 +668,7 @@ struct functor_traits<scalar_sinh_op<Scalar> >
*/
template<typename Scalar> struct scalar_cosh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op)
- EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::cosh; return cosh(a); }
+ EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); }
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); }
};
@@ -816,9 +880,9 @@ struct scalar_sign_op<Scalar,true> {
{
typedef typename NumTraits<Scalar>::Real real_type;
real_type aa = numext::abs(a);
- if (aa==0)
+ if (aa==real_type(0))
return Scalar(0);
- aa = 1./aa;
+ aa = real_type(1)/aa;
return Scalar(real(a)*aa, imag(a)*aa );
}
//TODO
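
The std::cos/sin/... calls above become numext:: calls because the std:: overloads are host-only; numext routes to something callable under __CUDA_ARCH__ and forwards to std:: on the host, so host-side behavior is unchanged:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(3, 0.f, 1.f);
      std::cout << x.cos().transpose() << "\n";       // elementwise, via the functors above
      std::cout << Eigen::numext::tanh(0.5) << "\n";  // scalar entry point, ~0.462117
    }
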
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 54e118395..5b0473598 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -11,8 +11,8 @@
#define EIGEN_GENERAL_BLOCK_PANEL_H
-namespace Eigen {
-
+namespace Eigen {
+
namespace internal {
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
@@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024;
#endif
/** \internal */
-struct CacheSizes {
+struct CacheSizes {
CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
int l1CacheSize, l2CacheSize, l3CacheSize;
queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
@@ -89,7 +89,7 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
*
* \sa setCpuCacheSizes */
-template<typename LhsScalar, typename RhsScalar, int KcFactor>
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
@@ -107,21 +107,17 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
enum {
kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
- k_mask = -8,
-
+ kr = 8,
mr = Traits::mr,
- mr_mask = -mr,
-
- nr = Traits::nr,
- nr_mask = -nr
+ nr = Traits::nr
};
// Increasing k gives us more time to prefetch the content of the "C"
// registers. However once the latency is hidden there is no point in
// increasing the value of k, so we'll cap it at 320 (value determined
// experimentally).
- const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
+ const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
if (k_cache < k) {
- k = k_cache & k_mask;
+ k = k_cache - (k_cache % kr);
eigen_internal_assert(k > 0);
}
@@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
if (n_cache <= n_per_thread) {
// Don't exceed the capacity of the l2 cache.
eigen_internal_assert(n_cache >= static_cast<Index>(nr));
- n = n_cache & nr_mask;
+ n = n_cache - (n_cache % nr);
eigen_internal_assert(n > 0);
} else {
- n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
+ n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
}
if (l3 > l2) {
@@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
const Index m_per_thread = numext::div_ceil(m, num_threads);
if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
- m = m_cache & mr_mask;
+ m = m_cache - (m_cache % mr);
eigen_internal_assert(m > 0);
} else {
- m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
+ m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
}
}
}
@@ -156,29 +152,29 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
l2 = 32*1024;
l3 = 512*1024;
#endif
-
+
// Early return for small problems because the computations below are time consuming.
// Perhaps it would make more sense to consider k*n*m??
// Note that for very tiny problem, this function should be bypassed anyway
// because we use the coefficient-based implementation for them.
- if((std::max)(k,(std::max)(m,n))<48)
+ if((numext::maxi)(k,(numext::maxi)(m,n))<48)
return;
-
+
typedef typename Traits::ResScalar ResScalar;
enum {
k_peeling = 8,
k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
};
-
+
// ---- 1st level of blocking on L1, yields kc ----
-
+
// Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
// of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
// We also include a register-level block of the result (mr x nr).
// (In an ideal world only the lhs panel would stay in L1)
// Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
- const Index max_kc = std::max<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
+ const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
const Index old_k = k;
if(k>max_kc)
{
@@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// while keeping the same number of sweeps over the result.
k = (k%max_kc)==0 ? max_kc
: max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
-
+
eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
}
-
+
// ---- 2nd level of blocking on max(L2,L3), yields nc ----
-
+
// TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
// actual_l2 = max(l2, l3/nb_core_sharing_l3)
// The number below is quite conservative: it is better to underestimate the cache size rather than to overestimate it.
@@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
#else
const Index actual_l2 = 1572864; // == 1.5 MB
#endif
-
+
// Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
// The second half is implicitly reserved to access the result and lhs coefficients.
// When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
@@ -223,7 +219,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
}
// WARNING Below, we assume that Traits::nr is a power of two.
- Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+ Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
if(n>nc)
{
// We are really blocking over the columns:
@@ -252,9 +248,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// we have both L2 and L3, and problem is small enough to be kept in L2
// Let's choose m such that lhs's block fit in 1/3 of L2
actual_lm = l2;
- max_mc = (std::min<Index>)(576,max_mc);
+ max_mc = (numext::mini<Index>)(576,max_mc);
}
- Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+ Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
if (mc > Traits::mr) mc -= mc % Traits::mr;
else if (mc==0) return;
m = (m%mc)==0 ? mc
@@ -263,13 +259,14 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
}
}
+template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
- k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
- m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
- n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+ k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+ m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+ n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
return true;
}
#else
@@ -296,11 +293,11 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
*
* \sa setCpuCacheSizes */
-template<typename LhsScalar, typename RhsScalar, int KcFactor>
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
if (!useSpecificBlockingSizes(k, m, n)) {
- evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
+ evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
}
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
@@ -314,10 +311,10 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
if (n > nr) n -= n % nr;
}
-template<typename LhsScalar, typename RhsScalar>
+template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
- computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
+ computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
@@ -2224,6 +2221,16 @@ inline std::ptrdiff_t l2CacheSize()
return l2;
}
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+* \sa setCpuCacheSizes */
+inline std::ptrdiff_t l3CacheSize()
+{
+ std::ptrdiff_t l1, l2, l3;
+ internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ return l3;
+}
+
/** Set the cpu L1 and L2 cache sizes (in bytes).
* These values are use to adjust the size of the blocks
* for the algorithms working per blocks.
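
Alongside templating the blocking helpers on Index, the hunk publishes the L3 size next to the existing queries. A quick usage sketch:

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      std::cout << "L1: " << Eigen::l1CacheSize() << " bytes\n";
      std::cout << "L2: " << Eigen::l2CacheSize() << " bytes\n";
      std::cout << "L3: " << Eigen::l3CacheSize() << " bytes\n";  // added above
    }
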
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 831089dee..80ba89465 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
- const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
+ const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
{
general_matrix_matrix_triangular_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 3deed068e..911df8ff3 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -25,13 +25,13 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
+ * Content : Eigen bindings to BLAS F77
* Level 3 BLAS SYRK/HERK implementation.
********************************************************************************
*/
-#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
-#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
namespace Eigen {
@@ -44,34 +44,35 @@ struct general_matrix_matrix_rankupdate :
// try to go to BLAS specialization
-#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
- const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
+ const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
{ \
if (lhs==rhs) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
- ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
} else { \
general_matrix_matrix_triangular_product<Index, \
Scalar, LhsStorageOrder, ConjugateLhs, \
Scalar, RhsStorageOrder, ConjugateRhs, \
ColMajor, UpLo, BuiltIn> \
- ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
+ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
} \
} \
};
-EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
-//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
-EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
-//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float)
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex)
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex)
// SYRK for float/double
-#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
+#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
enum { \
@@ -80,23 +81,19 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
}; \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
- const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+ const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
/* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
\
- MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
+ BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
- MKLTYPE alpha_, beta_; \
-\
-/* Set alpha_ & beta_ */ \
- assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
- assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
- MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
+ EIGTYPE beta(1); \
+ BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
} \
};
// HERK for complex data
-#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
+#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \
template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \
struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
enum { \
@@ -105,18 +102,15 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
}; \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
- const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
+ const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
\
- MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
+ BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
RTYPE alpha_, beta_; \
const EIGTYPE* a_ptr; \
\
-/* Set alpha_ & beta_ */ \
-/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
-/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
alpha_ = alpha.real(); \
beta_ = 1.0; \
/* Copy with conjugation in some cases*/ \
@@ -127,20 +121,21 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
lda = a.outerStride(); \
a_ptr = a.data(); \
} else a_ptr=lhs; \
- MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
+ BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \
} \
};
-EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
-EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk)
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
+EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
-//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
-//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk)
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
+// EIGEN_BLAS_RANKUPDATE_C(scomplex, float, float, cherk_)
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
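
Note that the specialization above forwards to ?syrk only when lhs==rhs, i.e. for a genuine rank update C += alpha*A*A^T; everything else stays on the built-in kernel. For reference, a minimal standalone sketch of the Fortran-style call the double-precision macro expands to, with a hand-declared prototype assumed to match the usual F77 convention (Eigen takes its declarations from Eigen/src/misc/blas.h; link with -lblas):

#include <cstdio>

extern "C" int dsyrk_(const char* uplo, const char* trans, const int* n, const int* k,
                      const double* alpha, const double* a, const int* lda,
                      const double* beta, double* c, const int* ldc);

int main() {
  double A[6] = {1, 2, 3, 4, 5, 6};  // 2x3, column-major
  double C[4] = {0, 0, 0, 0};        // 2x2, column-major; lower triangle is updated
  int n = 2, k = 3, lda = 2, ldc = 2;
  double alpha = 1.0, beta = 1.0;
  char uplo = 'L', trans = 'N';
  dsyrk_(&uplo, &trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc);  // C += A*A^T
  std::printf("%g %g %g\n", C[0], C[1], C[3]);  // prints 35 44 56
  return 0;
}
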
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
index b6ae729b2..7a3bdbf20 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -25,13 +25,13 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
+ * Content : Eigen bindings to BLAS F77
* General matrix-matrix product functionality based on ?GEMM.
********************************************************************************
*/
-#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
-#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
namespace Eigen {
@@ -46,7 +46,7 @@ namespace internal {
// gemm specialization
-#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
template< \
typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
@@ -66,55 +66,50 @@ static void run(Index rows, Index cols, Index depth, \
using std::conj; \
\
char transa, transb; \
- MKL_INT m, n, k, lda, ldb, ldc; \
+ BlasIndex m, n, k, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
- MKLTYPE alpha_, beta_; \
+ EIGTYPE beta(1); \
MatrixX##EIGPREFIX a_tmp, b_tmp; \
- EIGTYPE myone(1);\
\
/* Set transpose options */ \
transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
\
/* Set m, n, k */ \
- m = (MKL_INT)rows; \
- n = (MKL_INT)cols; \
- k = (MKL_INT)depth; \
-\
-/* Set alpha_ & beta_ */ \
- assign_scalar_eig2mkl(alpha_, alpha); \
- assign_scalar_eig2mkl(beta_, myone); \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
+ k = convert_index<BlasIndex>(depth); \
\
/* Set lda, ldb, ldc */ \
- lda = (MKL_INT)lhsStride; \
- ldb = (MKL_INT)rhsStride; \
- ldc = (MKL_INT)resStride; \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ ldb = convert_index<BlasIndex>(rhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
\
/* Set a, b, c */ \
if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
a_tmp = lhs.conjugate(); \
a = a_tmp.data(); \
- lda = a_tmp.outerStride(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
} else a = _lhs; \
\
if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
b_tmp = rhs.conjugate(); \
b = b_tmp.data(); \
- ldb = b_tmp.outerStride(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
- MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+ BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
}};
-GEMM_SPECIALIZATION(double, d, double, d)
-GEMM_SPECIALIZATION(float, f, float, s)
-GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
-GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c)
+GEMM_SPECIALIZATION(double, d, double, d)
+GEMM_SPECIALIZATION(float, f, float, s)
+GEMM_SPECIALIZATION(dcomplex, cd, double, z)
+GEMM_SPECIALIZATION(scomplex, cf, float, c)
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
+#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
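
With the gemm specializations in place, an ordinary Eigen product is expected to dispatch to ?gemm whenever the library is built with -DEIGEN_USE_BLAS and linked against a BLAS; nothing BLAS-specific appears in user code. A minimal sketch:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 32);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(32, 48);
  Eigen::MatrixXd C = A * B;  // expected to route through dgemm_ under EIGEN_USE_BLAS
  std::cout << C.norm() << "\n";
  return 0;
}
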
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
index 12c3d13bd..e3a5d5892 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -25,13 +25,13 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
+ * Content : Eigen bindings to BLAS F77
* General matrix-vector product functionality based on ?GEMV.
********************************************************************************
*/
-#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
-#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
namespace Eigen {
@@ -49,7 +49,7 @@ namespace internal {
template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
struct general_matrix_vector_product_gemv;
-#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \
template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
static void run( \
@@ -80,12 +80,12 @@ static void run( \
} \
}; \
-EIGEN_MKL_GEMV_SPECIALIZE(double)
-EIGEN_MKL_GEMV_SPECIALIZE(float)
-EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_GEMV_SPECIALIZE(scomplex)
+EIGEN_BLAS_GEMV_SPECIALIZE(double)
+EIGEN_BLAS_GEMV_SPECIALIZE(float)
+EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
-#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \
@@ -97,16 +97,15 @@ static void run( \
const EIGTYPE* rhs, Index rhsIncr, \
EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
{ \
- MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
- MKLTYPE alpha_, beta_; \
- const EIGTYPE *x_ptr, myone(1); \
+ BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \
+ lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \
+ const EIGTYPE beta(1); \
+ const EIGTYPE *x_ptr; \
char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
if (LhsStorageOrder==RowMajor) { \
- m=cols; \
- n=rows; \
+ m = convert_index<BlasIndex>(cols); \
+ n = convert_index<BlasIndex>(rows); \
}\
- assign_scalar_eig2mkl(alpha_, alpha); \
- assign_scalar_eig2mkl(beta_, myone); \
GEMVVector x_tmp; \
if (ConjugateRhs) { \
Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
@@ -114,17 +113,17 @@ static void run( \
x_ptr=x_tmp.data(); \
incx=1; \
} else x_ptr=rhs; \
- MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+ BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
-EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d)
-EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s)
-EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, c)
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
+#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
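
The gemv path is triggered the same way, by a plain matrix-vector product. A minimal sketch under the same build assumptions:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 80);
  Eigen::VectorXd x = Eigen::VectorXd::Random(80);
  Eigen::VectorXd y = A * x;  // expected to route through dgemv_ under EIGEN_USE_BLAS
  std::cout << y.norm() << "\n";
  return 0;
}
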
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
index dfa687fef..c3e37b1e0 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -25,13 +25,13 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
+ * Content : Eigen bindings to BLAS F77
* Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
********************************************************************************
*/
-#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
-#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
namespace Eigen {
@@ -40,7 +40,7 @@ namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
-#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -52,28 +52,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
- EIGTYPE alpha) \
+ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
char side='L', uplo='L'; \
- MKL_INT m, n, lda, ldb, ldc; \
+ BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
- MKLTYPE alpha_, beta_; \
+ EIGTYPE beta(1); \
MatrixX##EIGPREFIX b_tmp; \
- EIGTYPE myone(1);\
\
/* Set transpose options */ \
/* Set m, n, k */ \
- m = (MKL_INT)rows; \
- n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
- assign_scalar_eig2mkl(alpha_, alpha); \
- assign_scalar_eig2mkl(beta_, myone); \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
\
/* Set lda, ldb, ldc */ \
- lda = (MKL_INT)lhsStride; \
- ldb = (MKL_INT)rhsStride; \
- ldc = (MKL_INT)resStride; \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ ldb = convert_index<BlasIndex>(rhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
\
/* Set a, b, c */ \
if (LhsStorageOrder==RowMajor) uplo='U'; \
@@ -83,16 +78,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = rhs.adjoint(); \
b = b_tmp.data(); \
- ldb = b_tmp.outerStride(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
- MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+ BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -103,29 +98,24 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
- EIGTYPE alpha) \
+ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
char side='L', uplo='L'; \
- MKL_INT m, n, lda, ldb, ldc; \
+ BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
- MKLTYPE alpha_, beta_; \
+ EIGTYPE beta(1); \
MatrixX##EIGPREFIX b_tmp; \
Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
- EIGTYPE myone(1); \
\
/* Set transpose options */ \
/* Set m, n, k */ \
- m = (MKL_INT)rows; \
- n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
- assign_scalar_eig2mkl(alpha_, alpha); \
- assign_scalar_eig2mkl(beta_, myone); \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
\
/* Set lda, ldb, ldc */ \
- lda = (MKL_INT)lhsStride; \
- ldb = (MKL_INT)rhsStride; \
- ldc = (MKL_INT)resStride; \
+ lda = convert_index<BlasIndex>(lhsStride); \
+ ldb = convert_index<BlasIndex>(rhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
\
/* Set a, b, c */ \
if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
@@ -151,23 +141,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
b_tmp = rhs.transpose(); \
} \
b = b_tmp.data(); \
- ldb = b_tmp.outerStride(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
- MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+ BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-EIGEN_MKL_SYMM_L(double, double, d, d)
-EIGEN_MKL_SYMM_L(float, float, f, s)
-EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_SYMM_L(double, double, d, d)
+EIGEN_BLAS_SYMM_L(float, float, f, s)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
-#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -179,27 +169,22 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
- EIGTYPE alpha) \
+ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
char side='R', uplo='L'; \
- MKL_INT m, n, lda, ldb, ldc; \
+ BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
- MKLTYPE alpha_, beta_; \
+ EIGTYPE beta(1); \
MatrixX##EIGPREFIX b_tmp; \
- EIGTYPE myone(1);\
\
/* Set m, n, k */ \
- m = (MKL_INT)rows; \
- n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
- assign_scalar_eig2mkl(alpha_, alpha); \
- assign_scalar_eig2mkl(beta_, myone); \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
\
/* Set lda, ldb, ldc */ \
- lda = (MKL_INT)rhsStride; \
- ldb = (MKL_INT)lhsStride; \
- ldc = (MKL_INT)resStride; \
+ lda = convert_index<BlasIndex>(rhsStride); \
+ ldb = convert_index<BlasIndex>(lhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
\
/* Set a, b, c */ \
if (RhsStorageOrder==RowMajor) uplo='U'; \
@@ -209,16 +194,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
b_tmp = lhs.adjoint(); \
b = b_tmp.data(); \
- ldb = b_tmp.outerStride(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _lhs; \
\
- MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+ BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
-#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -229,35 +214,30 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
- EIGTYPE alpha) \
+ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
char side='R', uplo='L'; \
- MKL_INT m, n, lda, ldb, ldc; \
+ BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
- MKLTYPE alpha_, beta_; \
+ EIGTYPE beta(1); \
MatrixX##EIGPREFIX b_tmp; \
Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
- EIGTYPE myone(1); \
\
/* Set m, n, k */ \
- m = (MKL_INT)rows; \
- n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
- assign_scalar_eig2mkl(alpha_, alpha); \
- assign_scalar_eig2mkl(beta_, myone); \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(cols); \
\
/* Set lda, ldb, ldc */ \
- lda = (MKL_INT)rhsStride; \
- ldb = (MKL_INT)lhsStride; \
- ldc = (MKL_INT)resStride; \
+ lda = convert_index<BlasIndex>(rhsStride); \
+ ldb = convert_index<BlasIndex>(lhsStride); \
+ ldc = convert_index<BlasIndex>(resStride); \
\
/* Set a, b, c */ \
if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
a_tmp = rhs.conjugate(); \
a = a_tmp.data(); \
- lda = a_tmp.outerStride(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
} else a = _rhs; \
if (RhsStorageOrder==RowMajor) uplo='U'; \
\
@@ -279,17 +259,17 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = b_tmp.outerStride(); \
} \
\
- MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
+ BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
} \
};
-EIGEN_MKL_SYMM_R(double, double, d, d)
-EIGEN_MKL_SYMM_R(float, float, f, s)
-EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_SYMM_R(double, double, d, d)
+EIGEN_BLAS_SYMM_R(float, float, f, s)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
+#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
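
The ?symm/?hemm specializations are reached through Eigen's selfadjoint views, which only reference one triangle of the factor. A minimal sketch under the same build assumptions:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd S = Eigen::MatrixXd::Random(32, 32);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(32, 16);
  // Only the lower triangle of S is read; expected to map onto dsymm_.
  Eigen::MatrixXd C = S.selfadjointView<Eigen::Lower>() * B;
  std::cout << C.norm() << "\n";
  return 0;
}
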
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
index a08f385bc..38f23accf 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
@@ -25,13 +25,13 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
+ * Content : Eigen bindings to BLAS F77
* Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV.
********************************************************************************
*/
-#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
-#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
namespace Eigen {
@@ -47,7 +47,7 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
struct selfadjoint_matrix_vector_product_symv :
selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
-#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \
template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
static void run( \
@@ -66,12 +66,12 @@ static void run( \
} \
}; \
-EIGEN_MKL_SYMV_SPECIALIZE(double)
-EIGEN_MKL_SYMV_SPECIALIZE(float)
-EIGEN_MKL_SYMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_SYMV_SPECIALIZE(scomplex)
+EIGEN_BLAS_SYMV_SPECIALIZE(double)
+EIGEN_BLAS_SYMV_SPECIALIZE(float)
+EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_SYMV_SPECIALIZE(scomplex)
-#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \
+#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
{ \
@@ -85,29 +85,27 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
IsLower = UpLo == Lower ? 1 : 0 \
}; \
- MKL_INT n=size, lda=lhsStride, incx=1, incy=1; \
- MKLTYPE alpha_, beta_; \
- const EIGTYPE *x_ptr, myone(1); \
+ BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \
+ EIGTYPE beta(1); \
+ const EIGTYPE *x_ptr; \
char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
- assign_scalar_eig2mkl(alpha_, alpha); \
- assign_scalar_eig2mkl(beta_, myone); \
SYMVVector x_tmp; \
if (ConjugateRhs) { \
Map<const SYMVVector, 0 > map_x(_rhs,size,1); \
x_tmp=map_x.conjugate(); \
x_ptr=x_tmp.data(); \
} else x_ptr=_rhs; \
- MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
+ BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
-EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv)
-EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv)
-EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
-EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
+#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
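
Likewise for the matrix-vector case; a minimal sketch:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd S = Eigen::MatrixXd::Random(64, 64);
  Eigen::VectorXd x = Eigen::VectorXd::Random(64);
  Eigen::VectorXd y = S.selfadjointView<Eigen::Upper>() * x;  // dsymv_/zhemv_ path
  std::cout << y.norm() << "\n";
  return 0;
}
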
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
index d9e7cf852..aecded6bb 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -25,13 +25,13 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
+ * Content : Eigen bindings to BLAS F77
* Triangular matrix * matrix product functionality based on ?TRMM.
********************************************************************************
*/
-#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
-#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
namespace Eigen {
@@ -50,7 +50,7 @@ struct product_triangular_matrix_matrix_trmm :
// try to go to BLAS specialization
-#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
+#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -65,17 +65,17 @@ struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
} \
};
-EIGEN_MKL_TRMM_SPECIALIZE(double, true)
-EIGEN_MKL_TRMM_SPECIALIZE(double, false)
-EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true)
-EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false)
-EIGEN_MKL_TRMM_SPECIALIZE(float, true)
-EIGEN_MKL_TRMM_SPECIALIZE(float, false)
-EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true)
-EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(double, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(double, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
// implements col-major += alpha * op(triangular) * op(general)
-#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -106,13 +106,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
\
-/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
+/* Non-square case - doesn't fit BLAS ?TRMM; fall back to the default triangular product or call BLAS ?GEMM*/ \
if (rows != depth) { \
\
- int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \
+ /* FIXME handle mkl_domain_get_max_threads */ \
+ /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\
\
if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
- /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
+ /* Most likely no benefit to call TRMM or GEMM from BLAS */ \
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
@@ -121,27 +122,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
/* Make sense to call GEMM */ \
Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
- MKL_INT aStride = aa_tmp.outerStride(); \
+ BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
\
- /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+ /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
} \
return; \
} \
char side = 'L', transa, uplo, diag = 'N'; \
EIGTYPE *b; \
const EIGTYPE *a; \
- MKL_INT m, n, lda, ldb; \
- MKLTYPE alpha_; \
-\
-/* Set alpha_*/ \
- assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+ BlasIndex m, n, lda, ldb; \
\
/* Set m, n */ \
- m = (MKL_INT)diagSize; \
- n = (MKL_INT)cols; \
+ m = convert_index<BlasIndex>(diagSize); \
+ n = convert_index<BlasIndex>(cols); \
\
/* Set trans */ \
transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
@@ -152,7 +149,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
\
if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
b = b_tmp.data(); \
- ldb = b_tmp.outerStride(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
\
/* Set uplo */ \
uplo = IsLower ? 'L' : 'U'; \
@@ -168,14 +165,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
else if (IsUnitDiag) \
a_tmp.diagonal().setOnes();\
a = a_tmp.data(); \
- lda = a_tmp.outerStride(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
} else { \
a = _lhs; \
- lda = lhsStride; \
+ lda = convert_index<BlasIndex>(lhsStride); \
} \
- /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \
+ /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
- MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+ BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -183,13 +180,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
};
-EIGEN_MKL_TRMM_L(double, double, d, d)
-EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMM_L(float, float, f, s)
-EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMM_L(double, double, d, d)
+EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMM_L(float, float, f, s)
+EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
// implements col-major += alpha * op(general) * op(triangular)
-#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@@ -220,13 +217,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
\
-/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
+/* Non-square case - doesn't fit BLAS ?TRMM; fall back to the default triangular product or call BLAS ?GEMM*/ \
if (cols != depth) { \
\
- int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \
+ int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \
\
if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
- /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
+ /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
@@ -235,27 +232,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
/* Make sense to call GEMM */ \
Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
- MKL_INT aStride = aa_tmp.outerStride(); \
+ BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
\
- /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
+ /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
} \
return; \
} \
char side = 'R', transa, uplo, diag = 'N'; \
EIGTYPE *b; \
const EIGTYPE *a; \
- MKL_INT m, n, lda, ldb; \
- MKLTYPE alpha_; \
-\
-/* Set alpha_*/ \
- assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
+ BlasIndex m, n, lda, ldb; \
\
/* Set m, n */ \
- m = (MKL_INT)rows; \
- n = (MKL_INT)diagSize; \
+ m = convert_index<BlasIndex>(rows); \
+ n = convert_index<BlasIndex>(diagSize); \
\
/* Set trans */ \
transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
@@ -266,7 +259,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
\
if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
b = b_tmp.data(); \
- ldb = b_tmp.outerStride(); \
+ ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
\
/* Set uplo */ \
uplo = IsLower ? 'L' : 'U'; \
@@ -282,14 +275,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
else if (IsUnitDiag) \
a_tmp.diagonal().setOnes();\
a = a_tmp.data(); \
- lda = a_tmp.outerStride(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
} else { \
a = _rhs; \
- lda = rhsStride; \
+ lda = convert_index<BlasIndex>(rhsStride); \
} \
- /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \
+ /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
- MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
+ BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@@ -297,13 +290,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
};
-EIGEN_MKL_TRMM_R(double, double, d, d)
-EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMM_R(float, float, f, s)
-EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMM_R(double, double, d, d)
+EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMM_R(float, float, f, s)
+EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
+#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
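
Note the shape test in the macro: only a square triangular factor maps onto ?trmm directly, while a rectangular one takes the built-in kernel or the ?gemm fallback coded above. A minimal sketch of the direct case:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd T = Eigen::MatrixXd::Random(32, 32);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(32, 16);
  Eigen::MatrixXd C = T.triangularView<Eigen::Upper>() * B;  // square factor: dtrmm_ path
  std::cout << C.norm() << "\n";
  return 0;
}
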
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index 7c014b72a..f79840aa7 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -27,13 +27,13 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C
HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
};
static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
- const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
};
template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
- const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+ const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
{
static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
Index size = (std::min)(_rows,_cols);
diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
index 3672b1240..07bf26ce5 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -25,13 +25,13 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
+ * Content : Eigen bindings to BLAS F77
* Triangular matrix-vector product functionality based on ?TRMV.
********************************************************************************
*/
-#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
-#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
namespace Eigen {
@@ -47,7 +47,7 @@ template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename Rh
struct triangular_matrix_vector_product_trmv :
triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
-#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
@@ -65,13 +65,13 @@ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs
} \
};
-EIGEN_MKL_TRMV_SPECIALIZE(double)
-EIGEN_MKL_TRMV_SPECIALIZE(float)
-EIGEN_MKL_TRMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_TRMV_SPECIALIZE(scomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(double)
+EIGEN_BLAS_TRMV_SPECIALIZE(float)
+EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
// implements col-major: res += alpha * op(triangular) * vector
-#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
enum { \
@@ -105,17 +105,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
/* Square part handling */\
\
char trans, uplo, diag; \
- MKL_INT m, n, lda, incx, incy; \
+ BlasIndex m, n, lda, incx, incy; \
EIGTYPE const *a; \
- MKLTYPE alpha_, beta_; \
- assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
- assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+ EIGTYPE beta(1); \
\
/* Set m, n */ \
- n = (MKL_INT)size; \
- lda = lhsStride; \
+ n = convert_index<BlasIndex>(size); \
+ lda = convert_index<BlasIndex>(lhsStride); \
incx = 1; \
- incy = resIncr; \
+ incy = convert_index<BlasIndex>(resIncr); \
\
/* Set uplo, trans and diag*/ \
trans = 'N'; \
@@ -123,39 +121,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
- MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+ BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
- MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
-/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
+ BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit BLAS ?TRMV; fall back to the default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
x = x_tmp.data(); \
if (size<rows) { \
y = _res + size*resIncr; \
a = _lhs + size; \
- m = rows-size; \
- n = size; \
+ m = convert_index<BlasIndex>(rows-size); \
+ n = convert_index<BlasIndex>(size); \
} \
else { \
x += size; \
y = _res; \
a = _lhs + size*lda; \
- m = size; \
- n = cols-size; \
+ m = convert_index<BlasIndex>(size); \
+ n = convert_index<BlasIndex>(cols-size); \
} \
- MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+ BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
-EIGEN_MKL_TRMV_CM(double, double, d, d)
-EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMV_CM(float, float, f, s)
-EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMV_CM(double, double, d, d)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMV_CM(float, float, f, s)
+EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
// implements row-major: res += alpha * op(triangular) * vector
-#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
enum { \
@@ -189,17 +187,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
/* Square part handling */\
\
char trans, uplo, diag; \
- MKL_INT m, n, lda, incx, incy; \
+ BlasIndex m, n, lda, incx, incy; \
EIGTYPE const *a; \
- MKLTYPE alpha_, beta_; \
- assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
- assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+ EIGTYPE beta(1); \
\
/* Set m, n */ \
- n = (MKL_INT)size; \
- lda = lhsStride; \
+ n = convert_index<BlasIndex>(size); \
+ lda = convert_index<BlasIndex>(lhsStride); \
incx = 1; \
- incy = resIncr; \
+ incy = convert_index<BlasIndex>(resIncr); \
\
/* Set uplo, trans and diag*/ \
trans = ConjLhs ? 'C' : 'T'; \
@@ -207,39 +203,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
- MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+ BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
- MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
-/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
+ BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit BLAS ?TRMV; fall back to the default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
x = x_tmp.data(); \
if (size<rows) { \
y = _res + size*resIncr; \
a = _lhs + size*lda; \
- m = rows-size; \
- n = size; \
+ m = convert_index<BlasIndex>(rows-size); \
+ n = convert_index<BlasIndex>(size); \
} \
else { \
x += size; \
y = _res; \
a = _lhs + size; \
- m = size; \
- n = cols-size; \
+ m = convert_index<BlasIndex>(size); \
+ n = convert_index<BlasIndex>(cols-size); \
} \
- MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+ BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
-EIGEN_MKL_TRMV_RM(double, double, d, d)
-EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMV_RM(float, float, f, s)
-EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMV_RM(double, double, d, d)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMV_RM(float, float, f, s)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
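
As the code above shows, the trmv binding splits a non-square operand into a square ?trmv part plus a ?gemv correction. The square case, from user code, as a minimal sketch:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd T = Eigen::MatrixXd::Random(48, 48);
  Eigen::VectorXd x = Eigen::VectorXd::Random(48);
  Eigen::VectorXd y = T.triangularView<Eigen::Lower>() * x;  // dtrmv_ + daxpy_ path
  std::cout << y.norm() << "\n";
  return 0;
}
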
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 208593718..1bed66ed8 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -83,7 +83,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
// coherence when accessing the rhs elements
std::ptrdiff_t l1, l2, l3;
manage_caching_sizes(GetAction, &l1, &l2, &l3);
- Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
+ Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0;
subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
for(Index k2=IsLower ? 0 : size;
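
The revised heuristic keeps roughly 4*sizeof(Scalar)*max(otherStride,size)*subcols bytes of rhs data inside L2. Clamping the denominator from below by the panel height means subcols can only shrink, which guards against an oversized sweep when the reported stride is smaller than size (as can presumably happen when the OnTheRight solver delegates to this kernel on transposed operands). Illustrative arithmetic with assumed cache and size values, not Eigen code:

#include <algorithm>
#include <cstdio>

int main() {
  long l2 = 262144;                     // assume a 256 KiB L2 cache
  long size = 1024, otherStride = 512;  // hypothetical: stride below panel height
  long scalarBytes = 8;                 // double
  long oldSubcols = l2 / (4 * scalarBytes * otherStride);                  // 16
  long newSubcols = l2 / (4 * scalarBytes * std::max(otherStride, size));  // 8
  std::printf("old=%ld new=%ld\n", oldSubcols, newSubcols);
  return 0;
}
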
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
index 6a0bb8339..88c0fb794 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -25,20 +25,20 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************
- * Content : Eigen bindings to Intel(R) MKL
+ * Content : Eigen bindings to BLAS F77
* Triangular matrix * matrix product functionality based on ?TRMM.
********************************************************************************
*/
-#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
-#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
namespace Eigen {
namespace internal {
// implements LeftSide op(triangular)^-1 * general
-#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
{ \
@@ -53,13 +53,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
const EIGTYPE* _tri, Index triStride, \
EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
{ \
- MKL_INT m = size, n = otherSize, lda, ldb; \
+ BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
char side = 'L', uplo, diag='N', transa; \
/* Set alpha_ */ \
- MKLTYPE alpha; \
- EIGTYPE myone(1); \
- assign_scalar_eig2mkl(alpha, myone); \
- ldb = otherStride;\
+ EIGTYPE alpha(1); \
+ ldb = convert_index<BlasIndex>(otherStride);\
\
const EIGTYPE *a; \
/* Set trans */ \
@@ -75,25 +73,25 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
if (conjA) { \
a_tmp = tri.conjugate(); \
a = a_tmp.data(); \
- lda = a_tmp.outerStride(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
} else { \
a = _tri; \
- lda = triStride; \
+ lda = convert_index<BlasIndex>(triStride); \
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
- MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+ BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
} \
};
-EIGEN_MKL_TRSM_L(double, double, d)
-EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_TRSM_L(float, float, s)
-EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c)
+EIGEN_BLAS_TRSM_L(double, double, d)
+EIGEN_BLAS_TRSM_L(dcomplex, double, z)
+EIGEN_BLAS_TRSM_L(float, float, s)
+EIGEN_BLAS_TRSM_L(scomplex, float, c)
// implements RightSide general * op(triangular)^-1
-#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
{ \
@@ -108,13 +106,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
const EIGTYPE* _tri, Index triStride, \
EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
{ \
- MKL_INT m = otherSize, n = size, lda, ldb; \
+ BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
char side = 'R', uplo, diag='N', transa; \
/* Set alpha_ */ \
- MKLTYPE alpha; \
- EIGTYPE myone(1); \
- assign_scalar_eig2mkl(alpha, myone); \
- ldb = otherStride;\
+ EIGTYPE alpha(1); \
+ ldb = convert_index<BlasIndex>(otherStride);\
\
const EIGTYPE *a; \
/* Set trans */ \
@@ -130,26 +126,26 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
if (conjA) { \
a_tmp = tri.conjugate(); \
a = a_tmp.data(); \
- lda = a_tmp.outerStride(); \
+ lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
} else { \
a = _tri; \
- lda = triStride; \
+ lda = convert_index<BlasIndex>(triStride); \
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
- MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+ BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
   /*std::cout << "TRSM_R specialization!\n";*/ \
} \
};
-EIGEN_MKL_TRSM_R(double, double, d)
-EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_TRSM_R(float, float, s)
-EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c)
+EIGEN_BLAS_TRSM_R(double, double, d)
+EIGEN_BLAS_TRSM_R(dcomplex, double, z)
+EIGEN_BLAS_TRSM_R(float, float, s)
+EIGEN_BLAS_TRSM_R(scomplex, float, c)
} // end namespace internal
} // end namespace Eigen
-#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
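
Triangular solves reach ?trsm through the triangular view's solve methods. A minimal sketch expected to hit EIGEN_BLAS_TRSM_L under -DEIGEN_USE_BLAS (the diagonal shift merely keeps the example well conditioned):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd T = Eigen::MatrixXd::Random(32, 32);
  T.diagonal().array() += 32.0;  // make the triangular factor safely invertible
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(32, 8);
  T.triangularView<Eigen::Lower>().solveInPlace(B);  // B <- T^{-1} * B via dtrsm_
  std::cout << B.norm() << "\n";
  return 0;
}
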
diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h
index 1ef3b61db..8c9239b1d 100644
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h
@@ -49,7 +49,7 @@
#define EIGEN_USE_LAPACKE
#endif
-#if defined(EIGEN_USE_BLAS) || defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
#define EIGEN_USE_MKL
#endif
@@ -64,7 +64,6 @@
# ifndef EIGEN_USE_MKL
/*If the MKL version is too old, undef everything*/
# undef EIGEN_USE_MKL_ALL
-# undef EIGEN_USE_BLAS
# undef EIGEN_USE_LAPACKE
# undef EIGEN_USE_MKL_VML
# undef EIGEN_USE_LAPACKE_STRICT
@@ -107,52 +106,23 @@
#else
#define EIGEN_MKL_DOMAIN_PARDISO MKL_PARDISO
#endif
+#endif
namespace Eigen {
typedef std::complex<double> dcomplex;
typedef std::complex<float> scomplex;
-namespace internal {
-
-template<typename MKLType, typename EigenType>
-static inline void assign_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
- mklScalar=eigenScalar;
-}
-
-template<typename MKLType, typename EigenType>
-static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
- mklScalar=eigenScalar;
-}
-
-template <>
-inline void assign_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
- mklScalar.real=eigenScalar.real();
- mklScalar.imag=eigenScalar.imag();
-}
-
-template <>
-inline void assign_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
- mklScalar.real=eigenScalar.real();
- mklScalar.imag=eigenScalar.imag();
-}
-
-template <>
-inline void assign_conj_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
- mklScalar.real=eigenScalar.real();
- mklScalar.imag=-eigenScalar.imag();
-}
-
-template <>
-inline void assign_conj_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
- mklScalar.real=eigenScalar.real();
- mklScalar.imag=-eigenScalar.imag();
-}
-
-} // end namespace internal
+#if defined(EIGEN_USE_MKL)
+typedef MKL_INT BlasIndex;
+#else
+typedef int BlasIndex;
+#endif
} // end namespace Eigen
+#if defined(EIGEN_USE_BLAS)
+#include "../../misc/blas.h"
#endif
#endif // EIGEN_MKL_SUPPORT_H
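
With EIGEN_USE_BLAS decoupled from MKL, indices handed to plain F77 BLAS default to int while Eigen's own Index is typically 64-bit, hence the convert_index<BlasIndex>() calls sprinkled through the bindings above. A hand-rolled equivalent of that checked narrowing, under those assumptions (to_blas_index is a hypothetical stand-in, not Eigen API):

#include <cassert>
#include <cstdint>
#include <limits>

typedef int BlasIndex;  // the non-MKL default chosen above

// Hypothetical stand-in for Eigen's internal convert_index<BlasIndex>().
inline BlasIndex to_blas_index(std::int64_t i) {
  assert(i >= 0 && i <= std::numeric_limits<BlasIndex>::max() &&
         "index does not fit in a BLAS integer");
  return static_cast<BlasIndex>(i);
}
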
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 97627d14c..69863d826 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -371,11 +371,11 @@
// Does the compiler support const expressions?
#ifdef __CUDACC__
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500)
+#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)
#define EIGEN_HAS_CONSTEXPR 1
#endif
-#elif (defined(__cplusplus) && __cplusplus >= 201402L) || \
- EIGEN_GNUC_AT_LEAST(4,8)
+#elif __has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
+ (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L))
#define EIGEN_HAS_CONSTEXPR 1
#endif
@@ -572,12 +572,12 @@ namespace Eigen {
//------------------------------------------------------------------------------------------
// Static and dynamic alignment control
-//
+//
// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
// a default value is automatically computed based on architecture, compiler, and OS.
-//
+//
// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
// to be used to declare statically aligned buffers.
//------------------------------------------------------------------------------------------
@@ -637,7 +637,7 @@ namespace Eigen {
#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
// Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
-
+
// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
@@ -664,13 +664,13 @@ namespace Eigen {
#else
#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
#endif
-
+
#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
#else
#define EIGEN_MAX_STATIC_ALIGN_BYTES 0
#endif
-
+
#endif
 // If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
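
The constexpr probe above only defines the feature flag; downstream code typically consumes it by expanding to the keyword or to nothing. A sketch with a hypothetical consumer macro (MY_CONSTEXPR is not part of this patch):

#if defined(EIGEN_HAS_CONSTEXPR)
#define MY_CONSTEXPR constexpr
#else
#define MY_CONSTEXPR
#endif

// Compiles as constexpr where supported, as a plain function elsewhere.
MY_CONSTEXPR int my_min(int a, int b) { return a < b ? a : b; }
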
diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h
index 5ab0d5920..b42a7df70 100644
--- a/Eigen/src/Geometry/Rotation2D.h
+++ b/Eigen/src/Geometry/Rotation2D.h
@@ -82,15 +82,15 @@ public:
/** \returns the rotation angle in [0,2pi] */
inline Scalar smallestPositiveAngle() const {
- Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI);
- return tmp<Scalar(0) ? tmp + Scalar(2)*EIGEN_PI : tmp;
+ Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI));
+ return tmp<Scalar(0) ? tmp + Scalar(2*EIGEN_PI) : tmp;
}
/** \returns the rotation angle in [-pi,pi] */
inline Scalar smallestAngle() const {
- Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI);
- if(tmp>Scalar(EIGEN_PI)) tmp -= Scalar(2)*Scalar(EIGEN_PI);
- else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2)*Scalar(EIGEN_PI);
+ Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI));
+ if(tmp>Scalar(EIGEN_PI)) tmp -= Scalar(2*EIGEN_PI);
+ else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2*EIGEN_PI);
return tmp;
}
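
Writing Scalar(2*EIGEN_PI) forms 2*pi at full precision and converts once, so both fmod arguments end up with type Scalar, which is presumably friendlier to low-precision scalar types than Scalar(2)*EIGEN_PI; numext::fmod then picks the appropriate overload. The wrapping itself, worked through in plain C++ for an angle of 7 rad:

#include <cmath>
#include <cstdio>

int main() {
  const double PI = 3.141592653589793;
  double angle = 7.0;                     // a bit more than 2*pi
  double tmp = std::fmod(angle, 2 * PI);  // ~0.716815
  double positive = tmp < 0 ? tmp + 2 * PI : tmp;  // in [0, 2*pi)
  double smallest = tmp;                           // mapped into [-pi, pi]
  if (smallest > PI)       smallest -= 2 * PI;
  else if (smallest < -PI) smallest += 2 * PI;
  std::printf("%f %f\n", positive, smallest);  // 0.716815 0.716815
  return 0;
}
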
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index 74cd0a472..e9f3ebf88 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -243,8 +243,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
{
workspace.resize(rows());
Index vecs = m_length;
- if( internal::is_same<typename internal::remove_all<VectorsType>::type,Dest>::value
- && internal::extract_data(dst) == internal::extract_data(m_vectors))
+ if(is_same_dense(dst,m_vectors))
{
// in-place
dst.diagonal().setOnes();
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 1721213d6..64b9eb7f1 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -231,6 +231,15 @@ template<typename _MatrixType> class FullPivLU
return Solve<FullPivLU, Rhs>(*this, b.derived());
}
+ /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
+ the LU decomposition.
+ */
+ inline RealScalar rcond() const
+ {
+    eigen_assert(m_isInitialized && "FullPivLU is not initialized.");
+ return internal::rcond_estimate_helper(m_l1_norm, *this);
+ }
+
/** \returns the determinant of the matrix of which
* *this is the LU decomposition. It has only linear complexity
* (that is, O(n) where n is the dimension of the square matrix)
@@ -410,6 +419,7 @@ template<typename _MatrixType> class FullPivLU
IntColVectorType m_rowsTranspositions;
IntRowVectorType m_colsTranspositions;
Index m_det_pq, m_nonzero_pivots;
+ RealScalar m_l1_norm;
RealScalar m_maxpivot, m_prescribedThreshold;
bool m_isInitialized, m_usePrescribedThreshold;
};
@@ -455,11 +465,12 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const EigenBase<InputType>
// the permutations are stored as int indices, so just to be sure:
eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
- m_isInitialized = true;
m_lu = matrix.derived();
+ m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
computeInPlace();
+ m_isInitialized = true;
return *this;
}
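A minimal usage sketch for the new rcond() accessor; the estimate is cheap because it reuses the stored factorization together with the L1 norm cached by compute():

#include <Eigen/LU>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(50, 50);
  Eigen::FullPivLU<Eigen::MatrixXd> lu(A);
  // Reciprocal condition number estimate in the L1 norm;
  // values close to zero indicate an ill-conditioned matrix.
  std::cout << "rcond: " << lu.rcond() << "\n";
  return 0;
}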
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index ab7797d2a..2e6d91939 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -76,7 +76,6 @@ template<typename _MatrixType> class PartialPivLU
typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
typedef typename MatrixType::PlainObject PlainObject;
-
/**
* \brief Default Constructor.
*
@@ -152,6 +151,15 @@ template<typename _MatrixType> class PartialPivLU
return Solve<PartialPivLU, Rhs>(*this, b.derived());
}
+ /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
+ the LU decomposition.
+ */
+ inline RealScalar rcond() const
+ {
+ eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+ return internal::rcond_estimate_helper(m_l1_norm, *this);
+ }
+
/** \returns the inverse of the matrix of which *this is the LU decomposition.
*
* \warning The matrix being decomposed here is assumed to be invertible. If you need to check for
@@ -178,7 +186,7 @@ template<typename _MatrixType> class PartialPivLU
*
* \sa MatrixBase::determinant()
*/
- typename internal::traits<MatrixType>::Scalar determinant() const;
+ Scalar determinant() const;
MatrixType reconstructedMatrix() const;
@@ -247,6 +255,7 @@ template<typename _MatrixType> class PartialPivLU
PermutationType m_p;
TranspositionType m_rowsTranspositions;
Index m_det_p;
+ RealScalar m_l1_norm;
bool m_isInitialized;
};
@@ -256,6 +265,7 @@ PartialPivLU<MatrixType>::PartialPivLU()
m_p(),
m_rowsTranspositions(),
m_det_p(0),
+ m_l1_norm(0),
m_isInitialized(false)
{
}
@@ -266,6 +276,7 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size)
m_p(size),
m_rowsTranspositions(size),
m_det_p(0),
+ m_l1_norm(0),
m_isInitialized(false)
{
}
@@ -277,6 +288,7 @@ PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix)
m_p(matrix.rows()),
m_rowsTranspositions(matrix.rows()),
m_det_p(0),
+ m_l1_norm(0),
m_isInitialized(false)
{
compute(matrix.derived());
@@ -467,6 +479,7 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu
eigen_assert(matrix.rows()<NumTraits<int>::highest());
m_lu = matrix.derived();
+ m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
const Index size = matrix.rows();
@@ -484,7 +497,7 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu
}
template<typename MatrixType>
-typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
+typename PartialPivLU<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
{
eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
return Scalar(m_det_p) * m_lu.diagonal().prod();
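The same pattern applies to PartialPivLU; a sketch, assuming an invertible square matrix as this decomposition requires. The cached m_l1_norm is simply the maximum absolute column sum of the input:

#include <Eigen/LU>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(50, 50);
  Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);
  double l1 = A.cwiseAbs().colwise().sum().maxCoeff(); // what compute() caches
  std::cout << "|A|_1 = " << l1 << ", rcond: " << lu.rcond() << "\n";
  return 0;
}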
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index e71944fd7..230d0d23c 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -397,6 +397,10 @@ CompleteOrthogonalDecomposition<MatrixType>& CompleteOrthogonalDecomposition<
const Index rank = m_cpqr.rank();
const Index cols = matrix.cols();
+ const Index rows = matrix.rows();
+ m_zCoeffs.resize((std::min)(rows, cols));
+ m_temp.resize(cols);
+
if (rank < cols) {
// We have reduced the (permuted) matrix to the form
// [R11 R12]
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 3552c87bf..799e81bd7 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -11,7 +11,7 @@
// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
// Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
-// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,6 +21,7 @@
#define EIGEN_BDCSVD_H
// #define EIGEN_BDCSVD_DEBUG_VERBOSE
// #define EIGEN_BDCSVD_SANITY_CHECKS
+
namespace Eigen {
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
@@ -49,6 +50,18 @@ struct traits<BDCSVD<_MatrixType> >
*
* \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition
*
+ * This class first reduces the input matrix to bi-diagonal form using class UpperBidiagonalization,
+ * and then performs a divide-and-conquer diagonalization. Small blocks are diagonalized using class JacobiSVD.
* You can control the switching size with the setSwitchSize() method, whose default is 16.
* For small matrices (<16), it is thus preferable to directly use JacobiSVD. For larger ones, BDCSVD is highly
* recommended and can be several orders of magnitude faster.
+ *
* \warning this algorithm is unlikely to provide accurate results when compiled with unsafe math optimizations.
* For instance, this concerns Intel's compiler (ICC), which performs such optimizations by default unless
+ * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
+ * significantly degrade the accuracy.
+ *
+ * \sa class JacobiSVD
*/
template<typename _MatrixType>
class BDCSVD : public SVDBase<BDCSVD<_MatrixType> >
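A sketch, not part of this changeset, of the division of labor described in the documentation above:

#include <Eigen/SVD>
#include <iostream>

int main()
{
  // Large problem: BDCSVD (divide and conquer, JacobiSVD on small blocks).
  Eigen::MatrixXf big = Eigen::MatrixXf::Random(500, 300);
  Eigen::BDCSVD<Eigen::MatrixXf> bdc(big, Eigen::ComputeThinU | Eigen::ComputeThinV);

  // Small problem (below the switching size): use JacobiSVD directly.
  Eigen::MatrixXf m8 = Eigen::MatrixXf::Random(8, 8);
  Eigen::JacobiSVD<Eigen::MatrixXf> jac(m8, Eigen::ComputeFullU | Eigen::ComputeFullV);

  std::cout << bdc.singularValues()(0) << " " << jac.singularValues()(0) << "\n";
  return 0;
}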
@@ -228,6 +241,8 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
#endif
allocate(matrix.rows(), matrix.cols(), computationOptions);
using std::abs;
+
+ const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
//**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
if(matrix.cols() < m_algoswap)
@@ -266,7 +281,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign
{
RealScalar a = abs(m_computed.coeff(i, i));
m_singularValues.coeffRef(i) = a * scale;
- if (a == 0)
+ if (a<considerZero)
{
m_nonzeroSingularValues = i;
m_singularValues.tail(m_diagSize - i - 1).setZero();
@@ -380,6 +395,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
using std::abs;
const Index n = lastCol - firstCol + 1;
const Index k = n/2;
+ const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
RealScalar alphaK;
RealScalar betaK;
RealScalar r0;
@@ -434,7 +450,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
}
if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1;
- if (r0 == 0)
+ if (r0<considerZero)
{
c0 = 1;
s0 = 0;
@@ -553,6 +569,8 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW,
template <typename MatrixType>
void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
{
+ const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+ using std::abs;
ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
ArrayRef diag = m_workspace.head(n);
@@ -575,7 +593,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
Index m = 0; // size of the deflated problem
for(Index k=0;k<actual_n;++k)
- if(col0(k)!=0)
+ if(abs(col0(k))>considerZero)
m_workspaceI(m++) = k;
Map<ArrayXi> perm(m_workspaceI.data(),m);
@@ -600,7 +618,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec
{
Index actual_n = n;
- while(actual_n>1 && col0(actual_n-1)==0) --actual_n;
+ while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n;
std::cout << "\n\n mus: " << mus.head(actual_n).transpose() << "\n\n";
std::cout << " check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
@@ -680,6 +698,7 @@ typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar
res += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu));
}
return res;
+
}
template <typename MatrixType>
@@ -746,14 +765,14 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
RealScalar muPrev, muCur;
if (shift == left)
{
- muPrev = (right - left) * 0.1;
+ muPrev = (right - left) * RealScalar(0.1);
if (k == actual_n-1) muCur = right - left;
- else muCur = (right - left) * 0.5;
+ else muCur = (right - left) * RealScalar(0.5);
}
else
{
- muPrev = -(right - left) * 0.1;
- muCur = -(right - left) * 0.5;
+ muPrev = -(right - left) * RealScalar(0.1);
+ muCur = -(right - left) * RealScalar(0.5);
}
RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift);
@@ -798,15 +817,15 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
RealScalar leftShifted, rightShifted;
if (shift == left)
{
- leftShifted = RealScalar(1)/NumTraits<RealScalar>::highest();
+ leftShifted = (std::numeric_limits<RealScalar>::min)();
// I don't understand why the case k==0 would be special there:
// if (k == 0) rightShifted = right - left; else
- rightShifted = (k==actual_n-1) ? right : ((right - left) * 0.6); // theoretically we can take 0.5, but let's be safe
+ rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe
}
else
{
- leftShifted = -(right - left) * 0.6;
- rightShifted = -RealScalar(1)/NumTraits<RealScalar>::highest();
+ leftShifted = -(right - left) * RealScalar(0.6);
+ rightShifted = -(std::numeric_limits<RealScalar>::min)();
}
RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
@@ -817,7 +836,10 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(!(fLeft * fRight<0))
+ {
+ std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose() << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n";
std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n";
+ }
#endif
eigen_internal_assert(fLeft * fRight < 0);
@@ -1028,8 +1050,9 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
Diagonal<MatrixXr> fulldiag(m_computed);
VectorBlock<Diagonal<MatrixXr>,Dynamic> diag(fulldiag, firstCol+shift, length);
+ const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
- RealScalar epsilon_strict = NumTraits<RealScalar>::epsilon() * maxDiag;
+ RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag);
RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
@@ -1082,7 +1105,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
{
// Check for total deflation
// If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting
- bool total_deflation = (col0.tail(length-1).array()==RealScalar(0)).all();
+ bool total_deflation = (col0.tail(length-1).array().abs()<considerZero).all();
// Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
// First, compute the respective permutation.
@@ -1093,7 +1116,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
// Move deflated diagonal entries at the end.
for(Index i=1; i<length; ++i)
- if(diag(i)==0)
+ if(abs(diag(i))<considerZero)
permutation[p++] = i;
Index i=1, j=k+1;
@@ -1112,7 +1135,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
for(Index i=1; i<length; ++i)
{
Index pi = permutation[i];
- if(diag(pi)==0 || diag(0)<diag(pi))
+ if(abs(diag(pi))<considerZero || diag(0)<diag(pi))
permutation[i-1] = permutation[i];
else
{
@@ -1163,7 +1186,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
//condition 4.4
{
Index i = length-1;
- while(i>0 && (diag(i)==0 || col0(i)==0)) --i;
+ while(i>0 && (abs(diag(i))<considerZero || abs(col0(i))<considerZero)) --i;
for(; i>1;--i)
if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
{
@@ -1177,7 +1200,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
for(Index j=2;j<length;++j)
- assert(diag(j-1)<=diag(j) || diag(j)==0);
+ assert(diag(j-1)<=diag(j) || abs(diag(j))<considerZero);
#endif
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
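The zero test introduced throughout this file compares magnitudes against the smallest normalized positive value instead of testing x==0, so denormals deflate too and NaN comparisons evaluate to false; a self-contained sketch:

#include <cmath>
#include <iostream>
#include <limits>

int main()
{
  const double considerZero = (std::numeric_limits<double>::min)();
  double subnormal = considerZero / 4.0; // a denormal value
  std::cout << std::boolalpha
            << (std::abs(subnormal) < considerZero) << "\n"  // true: treated as zero
            << (std::abs(0.25)      < considerZero) << "\n"; // false: kept
  return 0;
}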
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index bf5ff48c3..1940c8294 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -350,7 +350,8 @@ template<typename MatrixType, int QRPreconditioner>
struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
{
typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
- static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
+ typedef typename MatrixType::RealScalar RealScalar;
+ static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; }
};
template<typename MatrixType, int QRPreconditioner>
@@ -359,19 +360,30 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
- static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
+ static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry)
{
using std::sqrt;
+ using std::abs;
Scalar z;
JacobiRotation<Scalar> rot;
RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
-
+
+ const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+ const RealScalar precision = NumTraits<Scalar>::epsilon();
+
if(n==0)
{
- z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
- work_matrix.row(p) *= z;
- if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
- if(work_matrix.coeff(q,q)!=Scalar(0))
+ // make sure first column is zero
+ work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0);
+
+ if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero)
+ {
+ // work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when computing n
+ z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+ work_matrix.row(p) *= z;
+ if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
+ }
+ if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero)
{
z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
work_matrix.row(q) *= z;
@@ -385,19 +397,25 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
rot.s() = work_matrix.coeff(q,p) / n;
work_matrix.applyOnTheLeft(p,q,rot);
if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
- if(work_matrix.coeff(p,q) != Scalar(0))
+ if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero)
{
z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
work_matrix.col(q) *= z;
if(svd.computeV()) svd.m_matrixV.col(q) *= z;
}
- if(work_matrix.coeff(q,q) != Scalar(0))
+ if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero)
{
z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
work_matrix.row(q) *= z;
if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
}
}
+
+ // update largest diagonal entry
+ maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q))));
+ // and check whether the 2x2 block is already diagonal
+ RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+ return abs(work_matrix.coeff(p,q))>threshold || abs(work_matrix.coeff(q,p)) > threshold;
}
};
@@ -414,7 +432,6 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
JacobiRotation<RealScalar> rot1;
RealScalar t = m.coeff(0,0) + m.coeff(1,1);
RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-
if(d == RealScalar(0))
{
rot1.s() = RealScalar(0);
@@ -707,6 +724,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
}
/*** step 2. The main Jacobi SVD iteration. ***/
+ RealScalar maxDiagEntry = m_workMatrix.cwiseAbs().diagonal().maxCoeff();
bool finished = false;
while(!finished)
@@ -722,25 +740,27 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
// if this 2x2 sub-matrix is not diagonal already...
// notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
// keep us iterating forever. Similarly, small denormal numbers are considered zero.
- RealScalar threshold = numext::maxi<RealScalar>(considerAsZero,
- precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)),
- abs(m_workMatrix.coeff(q,q))));
- // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791)
+ RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
{
finished = false;
-
// perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
- internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q);
- JacobiRotation<RealScalar> j_left, j_right;
- internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
-
- // accumulate resulting Jacobi rotations
- m_workMatrix.applyOnTheLeft(p,q,j_left);
- if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
-
- m_workMatrix.applyOnTheRight(p,q,j_right);
- if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
+ // the complex-to-real operation returns true if the updated 2x2 block is not already diagonal
+ if(internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q, maxDiagEntry))
+ {
+ JacobiRotation<RealScalar> j_left, j_right;
+ internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+
+ // accumulate resulting Jacobi rotations
+ m_workMatrix.applyOnTheLeft(p,q,j_left);
+ if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
+
+ m_workMatrix.applyOnTheRight(p,q,j_right);
+ if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
+
+ // keep track of the largest diagonal coefficient
+ maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q))));
+ }
}
}
}
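A sketch of the new stopping test: instead of recomputing the threshold from the two local diagonal coefficients at each visit, it tracks the largest diagonal entry seen so far (maxDiagEntry):

#include <algorithm>
#include <iostream>
#include <limits>

int main()
{
  const double considerAsZero = (std::numeric_limits<double>::min)();
  const double precision = std::numeric_limits<double>::epsilon();
  double maxDiagEntry = 3.5;        // running maximum of |diagonal entries|
  double apq = 1e-18, aqp = 2e-17;  // off-diagonal magnitudes of a 2x2 block
  double threshold = (std::max)(considerAsZero, precision * maxDiagEntry);
  bool needsRotation = (apq > threshold) || (aqp > threshold);
  std::cout << std::boolalpha << needsRotation << "\n"; // false: block is diagonal enough
  return 0;
}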
diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
index fe4a97120..9143a4c82 100644
--- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -22,7 +22,7 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
class InnerIterator;
-// class ReverseInnerIterator;
+ class ReverseInnerIterator;
enum {
CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 0ae3017cc..7e2efd452 100644
--- a/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -986,7 +986,7 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
&m_sluStat, &info, Scalar());
StatFree(&m_sluStat);
- if(&x.coeffRef(0) != x_ref.data())
+ if(x.derived().data() != x_ref.data())
x = x_ref;
m_info = info==0 ? Success : NumericalIssue;
diff --git a/Eigen/src/misc/blas.h b/Eigen/src/misc/blas.h
index 6fce99ed5..25215b15e 100644
--- a/Eigen/src/misc/blas.h
+++ b/Eigen/src/misc/blas.h
@@ -30,15 +30,15 @@ int BLASFUNC(cdotcw) (int *, float *, int *, float *, int *, float*);
int BLASFUNC(zdotuw) (int *, double *, int *, double *, int *, double*);
int BLASFUNC(zdotcw) (int *, double *, int *, double *, int *, double*);
-int BLASFUNC(saxpy) (int *, float *, float *, int *, float *, int *);
-int BLASFUNC(daxpy) (int *, double *, double *, int *, double *, int *);
-int BLASFUNC(qaxpy) (int *, double *, double *, int *, double *, int *);
-int BLASFUNC(caxpy) (int *, float *, float *, int *, float *, int *);
-int BLASFUNC(zaxpy) (int *, double *, double *, int *, double *, int *);
-int BLASFUNC(xaxpy) (int *, double *, double *, int *, double *, int *);
-int BLASFUNC(caxpyc)(int *, float *, float *, int *, float *, int *);
-int BLASFUNC(zaxpyc)(int *, double *, double *, int *, double *, int *);
-int BLASFUNC(xaxpyc)(int *, double *, double *, int *, double *, int *);
+int BLASFUNC(saxpy) (const int *, const float *, const float *, const int *, float *, const int *);
+int BLASFUNC(daxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(qaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(caxpy) (const int *, const float *, const float *, const int *, float *, const int *);
+int BLASFUNC(zaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(xaxpy) (const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(caxpyc)(const int *, const float *, const float *, const int *, float *, const int *);
+int BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, const int *);
int BLASFUNC(scopy) (int *, float *, int *, float *, int *);
int BLASFUNC(dcopy) (int *, double *, int *, double *, int *);
@@ -177,31 +177,19 @@ int BLASFUNC(xgeru)(int *, int *, double *, double *, int *,
int BLASFUNC(xgerc)(int *, int *, double *, double *, int *,
double *, int *, double *, int *);
-int BLASFUNC(sgemv)(char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(dgemv)(char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(qgemv)(char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(cgemv)(char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zgemv)(char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(xgemv)(char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
+int BLASFUNC(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
-int BLASFUNC(strsv) (char *, char *, char *, int *, float *, int *,
- float *, int *);
-int BLASFUNC(dtrsv) (char *, char *, char *, int *, double *, int *,
- double *, int *);
-int BLASFUNC(qtrsv) (char *, char *, char *, int *, double *, int *,
- double *, int *);
-int BLASFUNC(ctrsv) (char *, char *, char *, int *, float *, int *,
- float *, int *);
-int BLASFUNC(ztrsv) (char *, char *, char *, int *, double *, int *,
- double *, int *);
-int BLASFUNC(xtrsv) (char *, char *, char *, int *, double *, int *,
- double *, int *);
+int BLASFUNC(strsv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *);
+int BLASFUNC(dtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrsv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *);
+int BLASFUNC(ztrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
int BLASFUNC(stpsv) (char *, char *, char *, int *, float *, float *, int *);
int BLASFUNC(dtpsv) (char *, char *, char *, int *, double *, double *, int *);
@@ -210,18 +198,12 @@ int BLASFUNC(ctpsv) (char *, char *, char *, int *, float *, float *, int *);
int BLASFUNC(ztpsv) (char *, char *, char *, int *, double *, double *, int *);
int BLASFUNC(xtpsv) (char *, char *, char *, int *, double *, double *, int *);
-int BLASFUNC(strmv) (char *, char *, char *, int *, float *, int *,
- float *, int *);
-int BLASFUNC(dtrmv) (char *, char *, char *, int *, double *, int *,
- double *, int *);
-int BLASFUNC(qtrmv) (char *, char *, char *, int *, double *, int *,
- double *, int *);
-int BLASFUNC(ctrmv) (char *, char *, char *, int *, float *, int *,
- float *, int *);
-int BLASFUNC(ztrmv) (char *, char *, char *, int *, double *, int *,
- double *, int *);
-int BLASFUNC(xtrmv) (char *, char *, char *, int *, double *, int *,
- double *, int *);
+int BLASFUNC(strmv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *);
+int BLASFUNC(dtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrmv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *);
+int BLASFUNC(ztrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *);
int BLASFUNC(stpmv) (char *, char *, char *, int *, float *, float *, int *);
int BLASFUNC(dtpmv) (char *, char *, char *, int *, double *, double *, int *);
@@ -244,18 +226,9 @@ int BLASFUNC(ctbsv) (char *, char *, char *, int *, int *, float *, int *, floa
int BLASFUNC(ztbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
int BLASFUNC(xtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-int BLASFUNC(ssymv) (char *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(dsymv) (char *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(qsymv) (char *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(csymv) (char *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zsymv) (char *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(xsymv) (char *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
+int BLASFUNC(ssymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(dsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
int BLASFUNC(sspmv) (char *, int *, float *, float *,
float *, int *, float *, float *, int *);
@@ -263,38 +236,17 @@ int BLASFUNC(dspmv) (char *, int *, double *, double *,
double *, int *, double *, double *, int *);
int BLASFUNC(qspmv) (char *, int *, double *, double *,
double *, int *, double *, double *, int *);
-int BLASFUNC(cspmv) (char *, int *, float *, float *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zspmv) (char *, int *, double *, double *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(xspmv) (char *, int *, double *, double *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(ssyr) (char *, int *, float *, float *, int *,
- float *, int *);
-int BLASFUNC(dsyr) (char *, int *, double *, double *, int *,
- double *, int *);
-int BLASFUNC(qsyr) (char *, int *, double *, double *, int *,
- double *, int *);
-int BLASFUNC(csyr) (char *, int *, float *, float *, int *,
- float *, int *);
-int BLASFUNC(zsyr) (char *, int *, double *, double *, int *,
- double *, int *);
-int BLASFUNC(xsyr) (char *, int *, double *, double *, int *,
- double *, int *);
+int BLASFUNC(ssyr) (const char *, const int *, const float *, const float *, const int *, float *, const int *);
+int BLASFUNC(dsyr) (const char *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(qsyr) (const char *, const int *, const double *, const double *, const int *, double *, const int *);
-int BLASFUNC(ssyr2) (char *, int *, float *,
- float *, int *, float *, int *, float *, int *);
-int BLASFUNC(dsyr2) (char *, int *, double *,
- double *, int *, double *, int *, double *, int *);
-int BLASFUNC(qsyr2) (char *, int *, double *,
- double *, int *, double *, int *, double *, int *);
-int BLASFUNC(csyr2) (char *, int *, float *,
- float *, int *, float *, int *, float *, int *);
-int BLASFUNC(zsyr2) (char *, int *, double *,
- double *, int *, double *, int *, double *, int *);
-int BLASFUNC(xsyr2) (char *, int *, double *,
- double *, int *, double *, int *, double *, int *);
+int BLASFUNC(ssyr2) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, float *, const int *);
+int BLASFUNC(dsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(qsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(csyr2) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, float *, const int *);
+int BLASFUNC(zsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *);
+int BLASFUNC(xsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *);
int BLASFUNC(sspr) (char *, int *, float *, float *, int *,
float *);
@@ -302,12 +254,6 @@ int BLASFUNC(dspr) (char *, int *, double *, double *, int *,
double *);
int BLASFUNC(qspr) (char *, int *, double *, double *, int *,
double *);
-int BLASFUNC(cspr) (char *, int *, float *, float *, int *,
- float *);
-int BLASFUNC(zspr) (char *, int *, double *, double *, int *,
- double *);
-int BLASFUNC(xspr) (char *, int *, double *, double *, int *,
- double *);
int BLASFUNC(sspr2) (char *, int *, float *,
float *, int *, float *, int *, float *);
@@ -347,12 +293,9 @@ int BLASFUNC(zhpr2) (char *, int *, double *,
int BLASFUNC(xhpr2) (char *, int *, double *,
double *, int *, double *, int *, double *);
-int BLASFUNC(chemv) (char *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zhemv) (char *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(xhemv) (char *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
+int BLASFUNC(chemv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
int BLASFUNC(chpmv) (char *, int *, float *, float *,
float *, int *, float *, float *, int *);
@@ -401,18 +344,12 @@ int BLASFUNC(xhbmv)(char *, int *, int *, double *, double *, int *,
/* Level 3 routines */
-int BLASFUNC(sgemm)(char *, char *, int *, int *, int *, float *,
- float *, int *, float *, int *, float *, float *, int *);
-int BLASFUNC(dgemm)(char *, char *, int *, int *, int *, double *,
- double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(qgemm)(char *, char *, int *, int *, int *, double *,
- double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(cgemm)(char *, char *, int *, int *, int *, float *,
- float *, int *, float *, int *, float *, float *, int *);
-int BLASFUNC(zgemm)(char *, char *, int *, int *, int *, double *,
- double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(xgemm)(char *, char *, int *, int *, int *, double *,
- double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
int BLASFUNC(cgemm3m)(char *, char *, int *, int *, int *, float *,
float *, int *, float *, int *, float *, float *, int *);
@@ -434,84 +371,48 @@ int BLASFUNC(zge2mm)(char *, char *, char *, int *, int *,
double *, double *, int *, double *, int *,
double *, double *, int *);
-int BLASFUNC(strsm)(char *, char *, char *, char *, int *, int *,
- float *, float *, int *, float *, int *);
-int BLASFUNC(dtrsm)(char *, char *, char *, char *, int *, int *,
- double *, double *, int *, double *, int *);
-int BLASFUNC(qtrsm)(char *, char *, char *, char *, int *, int *,
- double *, double *, int *, double *, int *);
-int BLASFUNC(ctrsm)(char *, char *, char *, char *, int *, int *,
- float *, float *, int *, float *, int *);
-int BLASFUNC(ztrsm)(char *, char *, char *, char *, int *, int *,
- double *, double *, int *, double *, int *);
-int BLASFUNC(xtrsm)(char *, char *, char *, char *, int *, int *,
- double *, double *, int *, double *, int *);
-
-int BLASFUNC(strmm)(char *, char *, char *, char *, int *, int *,
- float *, float *, int *, float *, int *);
-int BLASFUNC(dtrmm)(char *, char *, char *, char *, int *, int *,
- double *, double *, int *, double *, int *);
-int BLASFUNC(qtrmm)(char *, char *, char *, char *, int *, int *,
- double *, double *, int *, double *, int *);
-int BLASFUNC(ctrmm)(char *, char *, char *, char *, int *, int *,
- float *, float *, int *, float *, int *);
-int BLASFUNC(ztrmm)(char *, char *, char *, char *, int *, int *,
- double *, double *, int *, double *, int *);
-int BLASFUNC(xtrmm)(char *, char *, char *, char *, int *, int *,
- double *, double *, int *, double *, int *);
-
-int BLASFUNC(ssymm)(char *, char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(dsymm)(char *, char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(qsymm)(char *, char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(csymm)(char *, char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zsymm)(char *, char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(xsymm)(char *, char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-
-int BLASFUNC(csymm3m)(char *, char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-
-int BLASFUNC(ssyrk)(char *, char *, int *, int *, float *, float *, int *,
- float *, float *, int *);
-int BLASFUNC(dsyrk)(char *, char *, int *, int *, double *, double *, int *,
- double *, double *, int *);
-int BLASFUNC(qsyrk)(char *, char *, int *, int *, double *, double *, int *,
- double *, double *, int *);
-int BLASFUNC(csyrk)(char *, char *, int *, int *, float *, float *, int *,
- float *, float *, int *);
-int BLASFUNC(zsyrk)(char *, char *, int *, int *, double *, double *, int *,
- double *, double *, int *);
-int BLASFUNC(xsyrk)(char *, char *, int *, int *, double *, double *, int *,
- double *, double *, int *);
-
-int BLASFUNC(ssyr2k)(char *, char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(dsyr2k)(char *, char *, int *, int *, double *, double *, int *,
- double*, int *, double *, double *, int *);
-int BLASFUNC(qsyr2k)(char *, char *, int *, int *, double *, double *, int *,
- double*, int *, double *, double *, int *);
-int BLASFUNC(csyr2k)(char *, char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zsyr2k)(char *, char *, int *, int *, double *, double *, int *,
- double*, int *, double *, double *, int *);
-int BLASFUNC(xsyr2k)(char *, char *, int *, int *, double *, double *, int *,
- double*, int *, double *, double *, int *);
-
-int BLASFUNC(chemm)(char *, char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zhemm)(char *, char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
-int BLASFUNC(xhemm)(char *, char *, int *, int *, double *, double *, int *,
- double *, int *, double *, double *, int *);
+int BLASFUNC(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
+int BLASFUNC(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
+int BLASFUNC(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+
+int BLASFUNC(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
+int BLASFUNC(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(qtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
+int BLASFUNC(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+int BLASFUNC(xtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+
+int BLASFUNC(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(csymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+
+int BLASFUNC(csymm3m)(char *, char *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *);
+int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
+int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *);
+
+int BLASFUNC(ssyrk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(dsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(qsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(csyrk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+
+int BLASFUNC(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(qsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(csyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(xsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+
+int BLASFUNC(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
int BLASFUNC(chemm3m)(char *, char *, int *, int *, float *, float *, int *,
float *, int *, float *, float *, int *);
@@ -520,136 +421,17 @@ int BLASFUNC(zhemm3m)(char *, char *, int *, int *, double *, double *, int *,
int BLASFUNC(xhemm3m)(char *, char *, int *, int *, double *, double *, int *,
double *, int *, double *, double *, int *);
-int BLASFUNC(cherk)(char *, char *, int *, int *, float *, float *, int *,
- float *, float *, int *);
-int BLASFUNC(zherk)(char *, char *, int *, int *, double *, double *, int *,
- double *, double *, int *);
-int BLASFUNC(xherk)(char *, char *, int *, int *, double *, double *, int *,
- double *, double *, int *);
-
-int BLASFUNC(cher2k)(char *, char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zher2k)(char *, char *, int *, int *, double *, double *, int *,
- double*, int *, double *, double *, int *);
-int BLASFUNC(xher2k)(char *, char *, int *, int *, double *, double *, int *,
- double*, int *, double *, double *, int *);
-int BLASFUNC(cher2m)(char *, char *, char *, int *, int *, float *, float *, int *,
- float *, int *, float *, float *, int *);
-int BLASFUNC(zher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
- double*, int *, double *, double *, int *);
-int BLASFUNC(xher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
- double*, int *, double *, double *, int *);
-
-int BLASFUNC(sgemt)(char *, int *, int *, float *, float *, int *,
- float *, int *);
-int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *,
- double *, int *);
-int BLASFUNC(cgemt)(char *, int *, int *, float *, float *, int *,
- float *, int *);
-int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *,
- double *, int *);
+int BLASFUNC(cherk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *);
+
+int BLASFUNC(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(cher2m)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
+int BLASFUNC(xher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *);
-int BLASFUNC(sgema)(char *, char *, int *, int *, float *,
- float *, int *, float *, float *, int *, float *, int *);
-int BLASFUNC(dgema)(char *, char *, int *, int *, double *,
- double *, int *, double*, double *, int *, double*, int *);
-int BLASFUNC(cgema)(char *, char *, int *, int *, float *,
- float *, int *, float *, float *, int *, float *, int *);
-int BLASFUNC(zgema)(char *, char *, int *, int *, double *,
- double *, int *, double*, double *, int *, double*, int *);
-
-int BLASFUNC(sgems)(char *, char *, int *, int *, float *,
- float *, int *, float *, float *, int *, float *, int *);
-int BLASFUNC(dgems)(char *, char *, int *, int *, double *,
- double *, int *, double*, double *, int *, double*, int *);
-int BLASFUNC(cgems)(char *, char *, int *, int *, float *,
- float *, int *, float *, float *, int *, float *, int *);
-int BLASFUNC(zgems)(char *, char *, int *, int *, double *,
- double *, int *, double*, double *, int *, double*, int *);
-
-int BLASFUNC(sgetf2)(int *, int *, float *, int *, int *, int *);
-int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(cgetf2)(int *, int *, float *, int *, int *, int *);
-int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *);
-
-int BLASFUNC(sgetrf)(int *, int *, float *, int *, int *, int *);
-int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(cgetrf)(int *, int *, float *, int *, int *, int *);
-int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *);
-
-int BLASFUNC(slaswp)(int *, float *, int *, int *, int *, int *, int *);
-int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(claswp)(int *, float *, int *, int *, int *, int *, int *);
-int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *);
-
-int BLASFUNC(sgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *);
-int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(cgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *);
-int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-
-int BLASFUNC(sgesv)(int *, int *, float *, int *, int *, float *, int *, int *);
-int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(cgesv)(int *, int *, float *, int *, int *, float *, int *, int *);
-int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-
-int BLASFUNC(spotf2)(char *, int *, float *, int *, int *);
-int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotf2)(char *, int *, float *, int *, int *);
-int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(spotrf)(char *, int *, float *, int *, int *);
-int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotrf)(char *, int *, float *, int *, int *);
-int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(slauu2)(char *, int *, float *, int *, int *);
-int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(clauu2)(char *, int *, float *, int *, int *);
-int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(slauum)(char *, int *, float *, int *, int *);
-int BLASFUNC(dlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(qlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(clauum)(char *, int *, float *, int *, int *);
-int BLASFUNC(zlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(xlauum)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(strti2)(char *, char *, int *, float *, int *, int *);
-int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(ctrti2)(char *, char *, int *, float *, int *, int *);
-int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *);
-
-int BLASFUNC(strtri)(char *, char *, int *, float *, int *, int *);
-int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(ctrtri)(char *, char *, int *, float *, int *, int *);
-int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *);
-
-int BLASFUNC(spotri)(char *, int *, float *, int *, int *);
-int BLASFUNC(dpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotri)(char *, int *, float *, int *, int *);
-int BLASFUNC(zpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotri)(char *, int *, double *, int *, int *);
#ifdef __cplusplus
}
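With the const-qualified prototypes above, read-only operands can be passed without casts; a minimal sketch calling dgemm, assuming these declarations are in scope and a BLAS implementation is linked:

// Computes C = alpha*A*B + beta*C on column-major 2x2 matrices.
void gemm_example()
{
  const int n = 2;
  const double alpha = 1.0, beta = 0.0;
  const double A[4] = {1, 2, 3, 4};
  const double B[4] = {5, 6, 7, 8};
  double C[4];
  BLASFUNC(dgemm)("N", "N", &n, &n, &n, &alpha, A, &n, B, &n, &beta, C, &n);
}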
diff --git a/Eigen/src/misc/lapack.h b/Eigen/src/misc/lapack.h
new file mode 100644
index 000000000..249f3575c
--- /dev/null
+++ b/Eigen/src/misc/lapack.h
@@ -0,0 +1,152 @@
+#ifndef LAPACK_H
+#define LAPACK_H
+
+#include "blas.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+int BLASFUNC(csymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+int BLASFUNC(zsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+int BLASFUNC(xsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+
+
+int BLASFUNC(cspmv) (char *, int *, float *, float *,
+ float *, int *, float *, float *, int *);
+int BLASFUNC(zspmv) (char *, int *, double *, double *,
+ double *, int *, double *, double *, int *);
+int BLASFUNC(xspmv) (char *, int *, double *, double *,
+ double *, int *, double *, double *, int *);
+
+int BLASFUNC(csyr) (char *, int *, float *, float *, int *,
+ float *, int *);
+int BLASFUNC(zsyr) (char *, int *, double *, double *, int *,
+ double *, int *);
+int BLASFUNC(xsyr) (char *, int *, double *, double *, int *,
+ double *, int *);
+
+int BLASFUNC(cspr) (char *, int *, float *, float *, int *,
+ float *);
+int BLASFUNC(zspr) (char *, int *, double *, double *, int *,
+ double *);
+int BLASFUNC(xspr) (char *, int *, double *, double *, int *,
+ double *);
+
+int BLASFUNC(sgemt)(char *, int *, int *, float *, float *, int *,
+ float *, int *);
+int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *,
+ double *, int *);
+int BLASFUNC(cgemt)(char *, int *, int *, float *, float *, int *,
+ float *, int *);
+int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *,
+ double *, int *);
+
+int BLASFUNC(sgema)(char *, char *, int *, int *, float *,
+ float *, int *, float *, float *, int *, float *, int *);
+int BLASFUNC(dgema)(char *, char *, int *, int *, double *,
+ double *, int *, double*, double *, int *, double*, int *);
+int BLASFUNC(cgema)(char *, char *, int *, int *, float *,
+ float *, int *, float *, float *, int *, float *, int *);
+int BLASFUNC(zgema)(char *, char *, int *, int *, double *,
+ double *, int *, double*, double *, int *, double*, int *);
+
+int BLASFUNC(sgems)(char *, char *, int *, int *, float *,
+ float *, int *, float *, float *, int *, float *, int *);
+int BLASFUNC(dgems)(char *, char *, int *, int *, double *,
+ double *, int *, double*, double *, int *, double*, int *);
+int BLASFUNC(cgems)(char *, char *, int *, int *, float *,
+ float *, int *, float *, float *, int *, float *, int *);
+int BLASFUNC(zgems)(char *, char *, int *, int *, double *,
+ double *, int *, double*, double *, int *, double*, int *);
+
+int BLASFUNC(sgetf2)(int *, int *, float *, int *, int *, int *);
+int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(cgetf2)(int *, int *, float *, int *, int *, int *);
+int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *);
+
+int BLASFUNC(sgetrf)(int *, int *, float *, int *, int *, int *);
+int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(cgetrf)(int *, int *, float *, int *, int *, int *);
+int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *);
+int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *);
+
+int BLASFUNC(slaswp)(int *, float *, int *, int *, int *, int *, int *);
+int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(claswp)(int *, float *, int *, int *, int *, int *, int *);
+int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *);
+int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *);
+
+int BLASFUNC(sgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *);
+int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(cgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *);
+int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
+
+int BLASFUNC(sgesv)(int *, int *, float *, int *, int *, float *, int *, int *);
+int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(cgesv)(int *, int *, float *, int *, int *, float *, int *, int *);
+int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
+
+int BLASFUNC(spotf2)(char *, int *, float *, int *, int *);
+int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotf2)(char *, int *, float *, int *, int *);
+int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(spotrf)(char *, int *, float *, int *, int *);
+int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotrf)(char *, int *, float *, int *, int *);
+int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(slauu2)(char *, int *, float *, int *, int *);
+int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(clauu2)(char *, int *, float *, int *, int *);
+int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *);
+int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(slauum)(char *, int *, float *, int *, int *);
+int BLASFUNC(dlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(qlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(clauum)(char *, int *, float *, int *, int *);
+int BLASFUNC(zlauum)(char *, int *, double *, int *, int *);
+int BLASFUNC(xlauum)(char *, int *, double *, int *, int *);
+
+int BLASFUNC(strti2)(char *, char *, int *, float *, int *, int *);
+int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(ctrti2)(char *, char *, int *, float *, int *, int *);
+int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *);
+
+int BLASFUNC(strtri)(char *, char *, int *, float *, int *, int *);
+int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(ctrtri)(char *, char *, int *, float *, int *, int *);
+int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *);
+int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *);
+
+int BLASFUNC(spotri)(char *, int *, float *, int *, int *);
+int BLASFUNC(dpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(qpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(cpotri)(char *, int *, float *, int *, int *);
+int BLASFUNC(zpotri)(char *, int *, double *, int *, int *);
+int BLASFUNC(xpotri)(char *, int *, double *, int *, int *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
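
Editor's note: every declaration above goes through the BLASFUNC name-mangling macro from the companion blas.h. A minimal, hedged illustration (assuming the usual trailing-underscore Fortran mangling; the helper function below is hypothetical) of how one of these prototypes is consumed from C++:

    /* With plain Fortran mangling, BLASFUNC(dgetrf) resolves to dgetrf_. */
    extern "C" int dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info);

    void lu_factor_3x3(double a[9])  /* column-major 3x3 matrix */
    {
      int m = 3, n = 3, lda = 3, ipiv[3], info;
      dgetrf_(&m, &n, a, &lda, ipiv, &info);  /* a is overwritten by L and U */
    }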
diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 9422c40bc..5694592d6 100644
--- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -280,3 +280,21 @@ operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
}
+/** \returns an expression of the coefficient-wise ^ operator of *this and \a other
+ *
+ * \warning this operator is for expressions of bool only.
+ *
+ * Example: \include Cwise_boolean_xor.cpp
+ * Output: \verbinclude Cwise_boolean_xor.out
+ *
+ * \sa operator&&(), select()
+ */
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+inline const CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
+operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
+{
+ EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
+ THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+ return CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>(derived(),other.derived());
+}
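
Editor's note: a minimal usage sketch of the new boolean XOR operator (variable names are illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::Array<bool, 4, 1> a, b;
      a << true, true, false, false;
      b << true, false, true, false;
      // Coefficient-wise XOR; the static assertion rejects non-bool scalars.
      std::cout << (a ^ b).cast<int>().transpose() << std::endl;  // prints: 0 1 1 0
      return 0;
    }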
diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h
index 64666d75f..ea28496b7 100644
--- a/bench/BenchTimer.h
+++ b/bench/BenchTimer.h
@@ -22,7 +22,6 @@
# endif
# include <windows.h>
#elif defined(__APPLE__)
-#include <CoreServices/CoreServices.h>
#include <mach/mach_time.h>
#else
# include <unistd.h>
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 8fe211602..e0631b401 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -46,9 +46,10 @@ template <typename Device, typename T> class BenchmarkSuite {
void typeCasting(int num_iters) {
eigen_assert(m_ == n_);
Eigen::array<TensorIndex, 2> sizes;
- sizes[0] = m_;
- sizes[1] = k_;
- if (sizeof(T) < sizeof(int)) {
+ if (sizeof(T) >= sizeof(int)) {
+ sizes[0] = m_;
+ sizes[1] = k_;
+ } else {
sizes[0] = m_ * sizeof(T) / sizeof(int);
sizes[1] = k_ * sizeof(T) / sizeof(int);
}
@@ -200,9 +201,15 @@ template <typename Device, typename T> class BenchmarkSuite {
size_b[1] = k_/2;
TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
+#ifndef EIGEN_HAS_INDEX_LIST
Eigen::array<TensorIndex, 2> strides;
strides[0] = 1;
strides[1] = 2;
+#else
+ // Take advantage of cxx11 to give the compiler information it can use to
+ // optimize the code.
+ Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
+#endif
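
Editor's note: a rough sketch of the idea behind the cxx11 branch (assuming the unsupported CXX11 Tensor module; the function below is illustrative). Encoding the strides in the type lets the compiler treat them as compile-time constants, which the plain runtime array cannot guarantee:

    #include <unsupported/Eigen/CXX11/Tensor>

    void strided_view(const Eigen::Tensor<float, 2>& in, Eigen::Tensor<float, 2>& out)
    {
      // The strides are part of the type, so the index arithmetic folds to constants.
      Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
      out = in.stride(strides);  // every element along dim 0, every second along dim 1
    }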
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -361,7 +368,7 @@ template <typename Device, typename T> class BenchmarkSuite {
const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
b_, input_size);
Eigen::array<TensorIndex, 0> output_size;
- TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C(
+ TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
c_, output_size);
StartBenchmarkTiming();
diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
index 35c6f7489..65784d0d6 100644
--- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu
@@ -28,11 +28,12 @@ BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
-//BM_FuncGPU(coeffWiseOp);
-//BM_FuncGPU(algebraicFunc);
-//BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
// Contractions
@@ -48,11 +49,11 @@ BM_FuncGPU(colReduction);
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
-/*BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
-*/
+
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
diff --git a/blas/common.h b/blas/common.h
index 5ecb153e2..61d8344d9 100644
--- a/blas/common.h
+++ b/blas/common.h
@@ -10,8 +10,8 @@
#ifndef EIGEN_BLAS_COMMON_H
#define EIGEN_BLAS_COMMON_H
-#include <Eigen/Core>
-#include <Eigen/Jacobi>
+#include "../Eigen/Core"
+#include "../Eigen/Jacobi"
#include <complex>
@@ -19,8 +19,7 @@
#error the token SCALAR must be defined to compile this file
#endif
-#include <Eigen/src/misc/blas.h>
-
+#include "../Eigen/src/misc/blas.h"
#define NOTR 0
#define TR 1
@@ -94,6 +93,7 @@ enum
typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> PlainMatrixType;
typedef Map<Matrix<Scalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > MatrixType;
+typedef Map<const Matrix<Scalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > ConstMatrixType;
typedef Map<Matrix<Scalar,Dynamic,1>, 0, InnerStride<Dynamic> > StridedVectorType;
typedef Map<Matrix<Scalar,Dynamic,1> > CompactVectorType;
@@ -105,24 +105,43 @@ matrix(T* data, int rows, int cols, int stride)
}
template<typename T>
+Map<const Matrix<T,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> >
+matrix(const T* data, int rows, int cols, int stride)
+{
+ return Map<const Matrix<T,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> >(data, rows, cols, OuterStride<>(stride));
+}
+
+template<typename T>
Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(T* data, int size, int incr)
{
return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
}
template<typename T>
+Map<const Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(const T* data, int size, int incr)
+{
+ return Map<const Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
+}
+
+template<typename T>
Map<Matrix<T,Dynamic,1> > make_vector(T* data, int size)
{
return Map<Matrix<T,Dynamic,1> >(data, size);
}
template<typename T>
+Map<const Matrix<T,Dynamic,1> > make_vector(const T* data, int size)
+{
+ return Map<const Matrix<T,Dynamic,1> >(data, size);
+}
+
+template<typename T>
T* get_compact_vector(T* x, int n, int incx)
{
if(incx==1)
return x;
- T* ret = new Scalar[n];
+ typename Eigen::internal::remove_const<T>::type* ret = new Scalar[n];
if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse();
else make_vector(ret,n) = make_vector(x,n, incx);
return ret;
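
Editor's note: a standalone sketch (hypothetical helper, not part of the patch) of what the new const-qualified overloads enable — viewing read-only BLAS input arrays without casting away const:

    #include <Eigen/Core>

    double strided_dot(const double* x, const double* y, int n, int incx, int incy)
    {
      using namespace Eigen;
      // Map<const ...> accepts a const pointer directly; no const_cast needed.
      Map<const Matrix<double,Dynamic,1>, 0, InnerStride<Dynamic> > vx(x, n, InnerStride<Dynamic>(incx));
      Map<const Matrix<double,Dynamic,1>, 0, InnerStride<Dynamic> > vy(y, n, InnerStride<Dynamic>(incy));
      return vx.dot(vy);
    }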
diff --git a/blas/level1_impl.h b/blas/level1_impl.h
index e623bd178..f857bfa20 100644
--- a/blas/level1_impl.h
+++ b/blas/level1_impl.h
@@ -9,11 +9,11 @@
#include "common.h"
-int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy)
+int EIGEN_BLAS_FUNC(axpy)(const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, RealScalar *py, const int *incy)
{
- Scalar* x = reinterpret_cast<Scalar*>(px);
+ const Scalar* x = reinterpret_cast<const Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
if(*n<=0) return 0;
diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h
index 2edc51596..e3ce61435 100644
--- a/blas/level2_cplx_impl.h
+++ b/blas/level2_cplx_impl.h
@@ -16,7 +16,8 @@
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n hermitian matrix.
*/
-int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
+int EIGEN_BLAS_FUNC(hemv)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *pa, const int *lda,
+ const RealScalar *px, const int *incx, const RealScalar *pbeta, RealScalar *py, const int *incy)
{
typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
static const functype func[2] = {
@@ -26,11 +27,11 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
(internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* x = reinterpret_cast<Scalar*>(px);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* x = reinterpret_cast<const Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
// check arguments
int info = 0;
@@ -45,7 +46,7 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
if(*n==0)
return 1;
- Scalar* actual_x = get_compact_vector(x,*n,*incx);
+ const Scalar* actual_x = get_compact_vector(x,*n,*incx);
Scalar* actual_y = get_compact_vector(y,*n,*incy);
if(beta!=Scalar(1))
diff --git a/blas/level2_impl.h b/blas/level2_impl.h
index d09db0cc6..173f40b44 100644
--- a/blas/level2_impl.h
+++ b/blas/level2_impl.h
@@ -23,7 +23,8 @@ struct general_matrix_vector_product_wrapper
}
};
-int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc)
+int EIGEN_BLAS_FUNC(gemv)(const char *opa, const int *m, const int *n, const RealScalar *palpha,
+ const RealScalar *pa, const int *lda, const RealScalar *pb, const int *incb, const RealScalar *pbeta, RealScalar *pc, const int *incc)
{
typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar);
static const functype func[4] = {
@@ -36,11 +37,11 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
0
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* b = reinterpret_cast<Scalar*>(pb);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* b = reinterpret_cast<const Scalar*>(pb);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
// check arguments
int info = 0;
@@ -62,7 +63,7 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
if(code!=NOTR)
std::swap(actual_m,actual_n);
- Scalar* actual_b = get_compact_vector(b,actual_n,*incb);
+ const Scalar* actual_b = get_compact_vector(b,actual_n,*incb);
Scalar* actual_c = get_compact_vector(c,actual_m,*incc);
if(beta!=Scalar(1))
@@ -82,7 +83,7 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
return 1;
}
-int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
+int EIGEN_BLAS_FUNC(trsv)(const char *uplo, const char *opa, const char *diag, const int *n, const RealScalar *pa, const int *lda, RealScalar *pb, const int *incb)
{
typedef void (*functype)(int, const Scalar *, int, Scalar *);
static const functype func[16] = {
@@ -116,7 +117,7 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
0
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
int info = 0;
@@ -141,7 +142,7 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
-int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
+int EIGEN_BLAS_FUNC(trmv)(const char *uplo, const char *opa, const char *diag, const int *n, const RealScalar *pa, const int *lda, RealScalar *pb, const int *incb)
{
typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, const Scalar&);
static const functype func[16] = {
@@ -175,7 +176,7 @@ int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar
0
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
int info = 0;
@@ -217,11 +218,11 @@ int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar
int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealScalar *palpha, RealScalar *pa, int *lda,
RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
{
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* x = reinterpret_cast<Scalar*>(px);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* x = reinterpret_cast<const Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
int coeff_rows = *kl+*ku+1;
int info = 0;
@@ -244,7 +245,7 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
if(OP(*trans)!=NOTR)
std::swap(actual_m,actual_n);
- Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
+ const Scalar* actual_x = get_compact_vector(x,actual_n,*incx);
Scalar* actual_y = get_compact_vector(y,actual_m,*incy);
if(beta!=Scalar(1))
@@ -253,7 +254,7 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
else make_vector(actual_y, actual_m) *= beta;
}
- MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
+ ConstMatrixType mat_coeffs(a,coeff_rows,*n,*lda);
int nb = std::min(*n,(*m)+(*ku));
for(int j=0; j<nb; ++j)
diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h
index 4896a03d9..7620f0a38 100644
--- a/blas/level2_real_impl.h
+++ b/blas/level2_real_impl.h
@@ -10,7 +10,8 @@
#include "common.h"
// y = alpha*A*x + beta*y
-int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
+int EIGEN_BLAS_FUNC(symv) (const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *pa, const int *lda,
+ const RealScalar *px, const int *incx, const RealScalar *pbeta, RealScalar *py, const int *incy)
{
typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar);
static const functype func[2] = {
@@ -20,11 +21,11 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
(internal::selfadjoint_matrix_vector_product<Scalar,int,ColMajor,Lower,false,false>::run),
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* x = reinterpret_cast<Scalar*>(px);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* x = reinterpret_cast<const Scalar*>(px);
Scalar* y = reinterpret_cast<Scalar*>(py);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
// check arguments
int info = 0;
@@ -39,7 +40,7 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
if(*n==0)
return 0;
- Scalar* actual_x = get_compact_vector(x,*n,*incx);
+ const Scalar* actual_x = get_compact_vector(x,*n,*incx);
Scalar* actual_y = get_compact_vector(y,*n,*incy);
if(beta!=Scalar(1))
@@ -61,7 +62,7 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
}
// C := alpha*x*x' + C
-int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syr)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, RealScalar *pc, const int *ldc)
{
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, const Scalar&);
@@ -72,9 +73,9 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
(selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run),
};
- Scalar* x = reinterpret_cast<Scalar*>(px);
+ const Scalar* x = reinterpret_cast<const Scalar*>(px);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
int info = 0;
if(UPLO(*uplo)==INVALID) info = 1;
@@ -87,7 +88,7 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
if(*n==0 || alpha==Scalar(0)) return 1;
// if the increment is not 1, let's copy it to a temporary vector to enable vectorization
- Scalar* x_cpy = get_compact_vector(x,*n,*incx);
+ const Scalar* x_cpy = get_compact_vector(x,*n,*incx);
int code = UPLO(*uplo);
if(code>=2 || func[code]==0)
@@ -101,7 +102,7 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *px,
}
// C := alpha*x*y' + alpha*y*x' + C
-int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syr2)(const char *uplo, const int *n, const RealScalar *palpha, const RealScalar *px, const int *incx, const RealScalar *py, const int *incy, RealScalar *pc, const int *ldc)
{
typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
static const functype func[2] = {
@@ -111,10 +112,10 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
(internal::rank2_update_selector<Scalar,int,Lower>::run),
};
- Scalar* x = reinterpret_cast<Scalar*>(px);
- Scalar* y = reinterpret_cast<Scalar*>(py);
+ const Scalar* x = reinterpret_cast<const Scalar*>(px);
+ const Scalar* y = reinterpret_cast<const Scalar*>(py);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
int info = 0;
if(UPLO(*uplo)==INVALID) info = 1;
@@ -128,8 +129,8 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px
if(alpha==Scalar(0))
return 1;
- Scalar* x_cpy = get_compact_vector(x,*n,*incx);
- Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+ const Scalar* x_cpy = get_compact_vector(x,*n,*incx);
+ const Scalar* y_cpy = get_compact_vector(y,*n,*incy);
int code = UPLO(*uplo);
if(code>=2 || func[code]==0)
diff --git a/blas/level3_impl.h b/blas/level3_impl.h
index 267a727ef..6c802cd5f 100644
--- a/blas/level3_impl.h
+++ b/blas/level3_impl.h
@@ -9,7 +9,8 @@
#include <iostream>
#include "common.h"
-int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(gemm)(const char *opa, const char *opb, const int *m, const int *n, const int *k, const RealScalar *palpha,
+ const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
{
// std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n";
typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar, internal::level3_blocking<Scalar,Scalar>&, Eigen::internal::GemmParallelInfo<DenseIndex>*);
@@ -37,11 +38,11 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
0
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* b = reinterpret_cast<Scalar*>(pb);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* b = reinterpret_cast<const Scalar*>(pb);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
int info = 0;
if(OP(*opa)==INVALID) info = 1;
@@ -74,7 +75,8 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
return 0;
}
-int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb)
+int EIGEN_BLAS_FUNC(trsm)(const char *side, const char *uplo, const char *opa, const char *diag, const int *m, const int *n,
+ const RealScalar *palpha, const RealScalar *pa, const int *lda, RealScalar *pb, const int *ldb)
{
// std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n";
typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, internal::level3_blocking<Scalar,Scalar>&);
@@ -137,9 +139,9 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
0
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
int info = 0;
if(SIDE(*side)==INVALID) info = 1;
@@ -157,7 +159,7 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
return 0;
int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4);
-
+
if(SIDE(*side)==LEFT)
{
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
@@ -178,7 +180,8 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
// b = alpha*op(a)*b for side = 'L'or'l'
// b = alpha*b*op(a) for side = 'R'or'r'
-int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb)
+int EIGEN_BLAS_FUNC(trmm)(const char *side, const char *uplo, const char *opa, const char *diag, const int *m, const int *n,
+ const RealScalar *palpha, const RealScalar *pa, const int *lda, RealScalar *pb, const int *ldb)
{
// std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n";
typedef void (*functype)(DenseIndex, DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&, internal::level3_blocking<Scalar,Scalar>&);
@@ -241,9 +244,9 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
0
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
Scalar* b = reinterpret_cast<Scalar*>(pb);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
int info = 0;
if(SIDE(*side)==INVALID) info = 1;
@@ -281,14 +284,15 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
// c = alpha*a*b + beta*c for side = 'L'or'l'
  // c = alpha*b*a + beta*c  for side = 'R'or'r'
-int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(symm)(const char *side, const char *uplo, const int *m, const int *n, const RealScalar *palpha,
+ const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
{
// std::cerr << "in symm " << *side << " " << *uplo << " " << *m << "x" << *n << " lda:" << *lda << " ldb:" << *ldb << " ldc:" << *ldc << " alpha:" << *palpha << " beta:" << *pbeta << "\n";
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* b = reinterpret_cast<Scalar*>(pb);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* b = reinterpret_cast<const Scalar*>(pb);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
int info = 0;
if(SIDE(*side)==INVALID) info = 1;
@@ -350,7 +354,8 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
// c = alpha*a*a' + beta*c for op = 'N'or'n'
// c = alpha*a'*a + beta*c for op = 'T'or't','C'or'c'
-int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const int *k,
+ const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
{
// std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
#if !ISCOMPLEX
@@ -373,14 +378,14 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
};
#endif
- Scalar* a = reinterpret_cast<Scalar*>(pa);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
int info = 0;
if(UPLO(*uplo)==INVALID) info = 1;
- else if(OP(*op)==INVALID) info = 2;
+ else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) ) info = 2;
else if(*n<0) info = 3;
else if(*k<0) info = 4;
else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7;
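
Editor's note: per the reference BLAS, the complex variants csyrk/zsyrk (and csyr2k/zsyr2k below) accept only 'N' and 'T' for the op argument; the conjugate-transpose form belongs to herk/her2k, hence the new ISCOMPLEX && OP(*op)==ADJ rejection.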
@@ -429,19 +434,20 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
// c = alpha*a*b' + alpha*b*a' + beta*c for op = 'N'or'n'
// c = alpha*a'*b + alpha*b'*a + beta*c for op = 'T'or't'
-int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(syr2k)(const char *uplo, const char *op, const int *n, const int *k, const RealScalar *palpha,
+ const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
{
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* b = reinterpret_cast<Scalar*>(pb);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* b = reinterpret_cast<const Scalar*>(pb);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
// std::cerr << "in syr2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n";
int info = 0;
if(UPLO(*uplo)==INVALID) info = 1;
- else if(OP(*op)==INVALID) info = 2;
+ else if(OP(*op)==INVALID || (ISCOMPLEX && OP(*op)==ADJ) ) info = 2;
else if(*n<0) info = 3;
else if(*k<0) info = 4;
else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7;
@@ -496,13 +502,14 @@ int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal
// c = alpha*a*b + beta*c for side = 'L'or'l'
  // c = alpha*b*a + beta*c  for side = 'R'or'r'
-int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(hemm)(const char *side, const char *uplo, const int *m, const int *n, const RealScalar *palpha,
+ const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
{
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* b = reinterpret_cast<Scalar*>(pb);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* b = reinterpret_cast<const Scalar*>(pb);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
- Scalar beta = *reinterpret_cast<Scalar*>(pbeta);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
+ Scalar beta = *reinterpret_cast<const Scalar*>(pbeta);
// std::cerr << "in hemm " << *side << " " << *uplo << " " << *m << " " << *n << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n";
@@ -554,7 +561,8 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
// c = alpha*a*conj(a') + beta*c for op = 'N'or'n'
// c = alpha*conj(a')*a + beta*c for op = 'C'or'c'
-int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(herk)(const char *uplo, const char *op, const int *n, const int *k,
+ const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
{
// std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
@@ -574,7 +582,7 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
0
};
- Scalar* a = reinterpret_cast<Scalar*>(pa);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
Scalar* c = reinterpret_cast<Scalar*>(pc);
RealScalar alpha = *palpha;
RealScalar beta = *pbeta;
@@ -601,7 +609,7 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
else
if(beta==Scalar(0)) matrix(c, *n, *n, *ldc).triangularView<Lower>().setZero();
else matrix(c, *n, *n, *ldc).triangularView<StrictlyLower>() *= beta;
-
+
if(beta!=Scalar(0))
{
matrix(c, *n, *n, *ldc).diagonal().real() *= beta;
@@ -620,12 +628,13 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
// c = alpha*a*conj(b') + conj(alpha)*b*conj(a') + beta*c, for op = 'N'or'n'
// c = alpha*conj(a')*b + conj(alpha)*conj(b')*a + beta*c, for op = 'C'or'c'
-int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
+int EIGEN_BLAS_FUNC(her2k)(const char *uplo, const char *op, const int *n, const int *k,
+ const RealScalar *palpha, const RealScalar *pa, const int *lda, const RealScalar *pb, const int *ldb, const RealScalar *pbeta, RealScalar *pc, const int *ldc)
{
- Scalar* a = reinterpret_cast<Scalar*>(pa);
- Scalar* b = reinterpret_cast<Scalar*>(pb);
+ const Scalar* a = reinterpret_cast<const Scalar*>(pa);
+ const Scalar* b = reinterpret_cast<const Scalar*>(pb);
Scalar* c = reinterpret_cast<Scalar*>(pc);
- Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+ Scalar alpha = *reinterpret_cast<const Scalar*>(palpha);
RealScalar beta = *pbeta;
// std::cerr << "in her2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n";
diff --git a/blas/single.cpp b/blas/single.cpp
index 836e3eee2..20ea57d5c 100644
--- a/blas/single.cpp
+++ b/blas/single.cpp
@@ -19,4 +19,4 @@
#include "level3_impl.h"
float BLASFUNC(sdsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy)
-{ return *alpha + BLASFUNC(dsdot)(n, x, incx, y, incy); }
+{ return double(*alpha) + BLASFUNC(dsdot)(n, x, incx, y, incy); }
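
Editor's note: in the reference BLAS, sdsdot accumulates the dot product in double precision and adds the scalar before rounding back to single; promoting *alpha to double makes the addition itself double precision as well.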
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index d5e3972b5..6f3661921 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -19,10 +19,25 @@ macro(ei_add_test_internal testname testname_with_suffix)
endif()
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
- if (${ARGC} GREATER 2)
- cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+ if(EIGEN_TEST_CUDA_CLANG)
+ set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
+ if(CUDA_64_BIT_DEVICE_CODE)
+ link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+ else()
+ link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib")
+ endif()
+      if (${ARGC} GREATER 2)
+        add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+      else()
+        add_executable(${targetname} ${filename})
+      endif()
+ target_link_libraries(${targetname} "cudart_static" "cuda" "dl" "rt" "pthread")
else()
- cuda_add_executable(${targetname} ${filename})
+ if (${ARGC} GREATER 2)
+ cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2})
+ else()
+ cuda_add_executable(${targetname} ${filename})
+ endif()
endif()
else()
add_executable(${targetname} ${filename})
@@ -316,7 +331,11 @@ macro(ei_testing_print_summary)
endif()
if(EIGEN_TEST_CUDA)
- message(STATUS "CUDA: ON")
+ if(EIGEN_TEST_CUDA_CLANG)
+ message(STATUS "CUDA: ON (using clang)")
+ else()
+ message(STATUS "CUDA: ON (using nvcc)")
+ endif()
else()
message(STATUS "CUDA: OFF")
endif()
diff --git a/doc/TutorialReshapeSlicing.dox b/doc/TutorialReshapeSlicing.dox
index eb0fb0df0..3730a5de6 100644
--- a/doc/TutorialReshapeSlicing.dox
+++ b/doc/TutorialReshapeSlicing.dox
@@ -37,10 +37,10 @@ Here is another example reshaping a 2x6 matrix to a 6x2 one:
\section TutorialSlicing Slicing
-Slicing consists in taking a set of rows, or columns, or elements, uniformly spaced within a matrix.
+Slicing consists of taking a set of rows, columns, or elements, uniformly spaced within a matrix.
Again, the class Map makes it easy to mimic this feature.
-For instance, one can take skip every P elements in a vector:
+For instance, one can take every P-th element in a vector:
<table class="example">
<tr><th>Example:</th><th>Output:</th></tr>
<tr><td>
diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox
index 02c62ad85..dbe559e53 100644
--- a/doc/UsingIntelMKL.dox
+++ b/doc/UsingIntelMKL.dox
@@ -55,7 +55,7 @@ Operations on other scalar types or mixing reals and complexes will continue to
In addition you can choose which parts will be substituted by defining one or multiple of the following macros:
<table class="manual">
-<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (currently works with Intel MKL only)</td></tr>
+<tr><td>\c EIGEN_USE_BLAS </td><td>Enables the use of external BLAS level 2 and 3 routines (compatible with any F77 BLAS interface, not only Intel MKL)</td></tr>
<tr class="alt"><td>\c EIGEN_USE_LAPACKE </td><td>Enables the use of external Lapack routines via the <a href="http://www.netlib.org/lapack/lapacke.html">Intel Lapacke</a> C interface to Lapack (currently works with Intel MKL only)</td></tr>
<tr><td>\c EIGEN_USE_LAPACKE_STRICT </td><td>Same as \c EIGEN_USE_LAPACKE but algorithms of lower robustness are disabled. This currently concerns only JacobiSVD, which would otherwise be replaced by \c gesvd, which is less robust than Jacobi rotations.</td></tr>
<tr class="alt"><td>\c EIGEN_USE_MKL_VML </td><td>Enables the use of Intel VML (vector operations)</td></tr>
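
Editor's note: a hedged sketch of enabling the generic BLAS backend (the library name is an example; any F77 BLAS such as OpenBLAS works):

    // Define before including any Eigen header (or pass -DEIGEN_USE_BLAS on the
    // compile line), then link the BLAS of your choice, e.g. -lblas or -lopenblas.
    #define EIGEN_USE_BLAS
    #include <Eigen/Dense>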
diff --git a/lapack/lapack_common.h b/lapack/lapack_common.h
index a93598784..c872a813e 100644
--- a/lapack/lapack_common.h
+++ b/lapack/lapack_common.h
@@ -11,6 +11,7 @@
#define EIGEN_LAPACK_COMMON_H
#include "../blas/common.h"
+#include "../Eigen/src/misc/lapack.h"
#define EIGEN_LAPACK_FUNC(FUNC,ARGLIST) \
extern "C" { int EIGEN_BLAS_FUNC(FUNC) ARGLIST; } \
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 841c4572b..7bed6a45c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -327,8 +327,14 @@ endif()
# CUDA unit tests
option(EIGEN_TEST_CUDA "Enable CUDA support in unit tests" OFF)
+option(EIGEN_TEST_CUDA_CLANG "Use clang instead of nvcc to compile the CUDA tests" OFF)
+
+if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang")
+ message(WARNING "EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.")
+endif()
+
if(EIGEN_TEST_CUDA)
-
+
find_package(CUDA 5.0)
if(CUDA_FOUND)
@@ -336,6 +342,9 @@ if(CUDA_FOUND)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
endif()
+ if(EIGEN_TEST_CUDA_CLANG)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
+ endif()
cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR})
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
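
Editor's note: with these options, the CUDA unit tests can be compiled by clang directly by configuring with something like cmake -DEIGEN_TEST_CUDA=ON -DEIGEN_TEST_CUDA_CLANG=ON -DCMAKE_CXX_COMPILER=clang++ (an illustrative invocation; paths and generator flags depend on the setup).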
diff --git a/test/array.cpp b/test/array.cpp
index 8b0a34722..beaa62221 100644
--- a/test/array.cpp
+++ b/test/array.cpp
@@ -331,11 +331,13 @@ template<typename ArrayType> void array_real(const ArrayType& m)
VERIFY_IS_APPROX(numext::zeta(Scalar(3), Scalar(-2.5)), RealScalar(0.054102025820864097));
VERIFY_IS_EQUAL(numext::zeta(Scalar(1), Scalar(1.2345)), // The second scalar does not matter
std::numeric_limits<RealScalar>::infinity());
+ VERIFY((numext::isnan)(numext::zeta(Scalar(0.9), Scalar(1.2345)))); // The second scalar does not matter
// Check the polygamma against scipy.special.polygamma examples
VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(2)), RealScalar(0.644934066848));
VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(3)), RealScalar(0.394934066848));
VERIFY_IS_APPROX(numext::polygamma(Scalar(1), Scalar(25.5)), RealScalar(0.0399946696496));
+ VERIFY((numext::isnan)(numext::polygamma(Scalar(1.5), Scalar(1.2345)))); // The second scalar does not matter
// Check the polygamma function over a larger range of values
VERIFY_IS_APPROX(numext::polygamma(Scalar(17), Scalar(4.7)), RealScalar(293.334565435));
diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index d652af5bf..b7abc230b 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -17,6 +17,12 @@
#include <Eigen/Cholesky>
#include <Eigen/QR>
+template<typename MatrixType, int UpLo>
+typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
+ MatrixType symm = m.template selfadjointView<UpLo>();
+ return symm.cwiseAbs().colwise().sum().maxCoeff();
+}
+
template<typename MatrixType,template <typename,int> class CholType> void test_chol_update(const MatrixType& symm)
{
typedef typename MatrixType::Scalar Scalar;
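
Editor's note: the quantity exercised by the new rcond checks is the reciprocal l1-norm condition number; the helper above supplies the max-column-sum norm of the symmetrized matrix:

$$\|A\|_1 = \max_j \sum_i |a_{ij}|, \qquad \operatorname{rcond}(A) = \frac{1}{\|A\|_1\,\|A^{-1}\|_1},$$

and the tests below accept an estimate within a factor of ten of this reference value.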
@@ -77,7 +83,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
{
SquareMatrixType symmUp = symm.template triangularView<Upper>();
SquareMatrixType symmLo = symm.template triangularView<Lower>();
-
+
LLT<SquareMatrixType,Lower> chollo(symmLo);
VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix());
vecX = chollo.solve(vecB);
@@ -85,6 +91,14 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
matX = chollo.solve(matB);
VERIFY_IS_APPROX(symm * matX, matB);
+ const MatrixType symmLo_inverse = chollo.solve(MatrixType::Identity(rows,cols));
+ RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
+ matrix_l1_norm<MatrixType, Lower>(symmLo_inverse);
+ RealScalar rcond_est = chollo.rcond();
+ // Verify that the estimated condition number is within a factor of 10 of the
+ // truth.
+ VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
// test the upper mode
LLT<SquareMatrixType,Upper> cholup(symmUp);
VERIFY_IS_APPROX(symm, cholup.reconstructedMatrix());
@@ -93,6 +107,15 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
matX = cholup.solve(matB);
VERIFY_IS_APPROX(symm * matX, matB);
+ // Verify that the estimated condition number is within a factor of 10 of the
+ // truth.
+ const MatrixType symmUp_inverse = cholup.solve(MatrixType::Identity(rows,cols));
+ rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) /
+ matrix_l1_norm<MatrixType, Upper>(symmUp_inverse);
+ rcond_est = cholup.rcond();
+ VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
+
MatrixType neg = -symmLo;
chollo.compute(neg);
VERIFY(chollo.info()==NumericalIssue);
@@ -101,7 +124,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
VERIFY_IS_APPROX(MatrixType(chollo.matrixU().transpose().conjugate()), MatrixType(chollo.matrixL()));
VERIFY_IS_APPROX(MatrixType(cholup.matrixL().transpose().conjugate()), MatrixType(cholup.matrixU()));
VERIFY_IS_APPROX(MatrixType(cholup.matrixU().transpose().conjugate()), MatrixType(cholup.matrixL()));
-
+
// test some special use cases of SelfCwiseBinaryOp:
MatrixType m1 = MatrixType::Random(rows,cols), m2(rows,cols);
m2 = m1;
@@ -137,6 +160,15 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
matX = ldltlo.solve(matB);
VERIFY_IS_APPROX(symm * matX, matB);
+ const MatrixType symmLo_inverse = ldltlo.solve(MatrixType::Identity(rows,cols));
+ RealScalar rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Lower>(symmLo)) /
+ matrix_l1_norm<MatrixType, Lower>(symmLo_inverse);
+ RealScalar rcond_est = ldltlo.rcond();
+ // Verify that the estimated condition number is within a factor of 10 of the
+ // truth.
+ VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
+
LDLT<SquareMatrixType,Upper> ldltup(symmUp);
VERIFY_IS_APPROX(symm, ldltup.reconstructedMatrix());
vecX = ldltup.solve(vecB);
@@ -144,6 +176,14 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
matX = ldltup.solve(matB);
VERIFY_IS_APPROX(symm * matX, matB);
+ // Verify that the estimated condition number is within a factor of 10 of the
+ // truth.
+ const MatrixType symmUp_inverse = ldltup.solve(MatrixType::Identity(rows,cols));
+ rcond = (RealScalar(1) / matrix_l1_norm<MatrixType, Upper>(symmUp)) /
+ matrix_l1_norm<MatrixType, Upper>(symmUp_inverse);
+ rcond_est = ldltup.rcond();
+ VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
+
VERIFY_IS_APPROX(MatrixType(ldltlo.matrixL().transpose().conjugate()), MatrixType(ldltlo.matrixU()));
VERIFY_IS_APPROX(MatrixType(ldltlo.matrixU().transpose().conjugate()), MatrixType(ldltlo.matrixL()));
VERIFY_IS_APPROX(MatrixType(ldltup.matrixL().transpose().conjugate()), MatrixType(ldltup.matrixU()));
@@ -167,7 +207,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
// restore
if(sign == -1)
symm = -symm;
-
+
// check matrices coming from linear constraints with Lagrange multipliers
if(rows>=3)
{
@@ -183,7 +223,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
vecX = ldltlo.solve(vecB);
VERIFY_IS_APPROX(A * vecX, vecB);
}
-
+
// check non-full rank matrices
if(rows>=3)
{
@@ -199,7 +239,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
vecX = ldltlo.solve(vecB);
VERIFY_IS_APPROX(A * vecX, vecB);
}
-
+
// check matrices with a wide spectrum
if(rows>=3)
{
@@ -225,7 +265,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
{
RealScalar large_tol = std::sqrt(test_precision<RealScalar>());
VERIFY((A * vecX).isApprox(vecB, large_tol));
-
+
++g_test_level;
VERIFY_IS_APPROX(A * vecX,vecB);
--g_test_level;
@@ -314,14 +354,14 @@ template<typename MatrixType> void cholesky_bug241(const MatrixType& m)
}
// LDLT is not guaranteed to work for indefinite matrices, but happens to work fine if matrix is diagonal.
-// This test checks that LDLT reports correctly that matrix is indefinite.
+// This test checks that LDLT correctly reports that the matrix is indefinite.
// See http://forum.kde.org/viewtopic.php?f=74&t=106942 and bug 736
template<typename MatrixType> void cholesky_definiteness(const MatrixType& m)
{
eigen_assert(m.rows() == 2 && m.cols() == 2);
MatrixType mat;
LDLT<MatrixType> ldlt(2);
-
+
{
mat << 1, 0, 0, -1;
ldlt.compute(mat);
@@ -384,11 +424,11 @@ void test_cholesky()
CALL_SUBTEST_3( cholesky_definiteness(Matrix2d()) );
CALL_SUBTEST_4( cholesky(Matrix3f()) );
CALL_SUBTEST_5( cholesky(Matrix4d()) );
-
- s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
+
+ s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
CALL_SUBTEST_2( cholesky(MatrixXd(s,s)) );
TEST_SET_BUT_UNUSED_VARIABLE(s)
-
+
s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) );
TEST_SET_BUT_UNUSED_VARIABLE(s)
@@ -402,6 +442,6 @@ void test_cholesky()
// Test problem size constructors
CALL_SUBTEST_9( LLT<MatrixXf>(10) );
CALL_SUBTEST_9( LDLT<MatrixXf>(10) );
-
+
TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries)
}
diff --git a/test/fastmath.cpp b/test/fastmath.cpp
index efdd5b313..438e6b2e5 100644
--- a/test/fastmath.cpp
+++ b/test/fastmath.cpp
@@ -49,7 +49,7 @@ void check_inf_nan(bool dryrun) {
VERIFY( !m.allFinite() );
VERIFY( m.hasNaN() );
}
- m(4) /= 0.0;
+ m(4) /= T(0.0);
if(dryrun)
{
std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << " ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp
index c1cc691c9..e77702bc7 100644
--- a/test/geo_hyperplane.cpp
+++ b/test/geo_hyperplane.cpp
@@ -97,9 +97,9 @@ template<typename Scalar> void lines()
Vector u = Vector::Random();
Vector v = Vector::Random();
Scalar a = internal::random<Scalar>();
- while (abs(a-1) < 1e-4) a = internal::random<Scalar>();
- while (u.norm() < 1e-4) u = Vector::Random();
- while (v.norm() < 1e-4) v = Vector::Random();
+ while (abs(a-1) < Scalar(1e-4)) a = internal::random<Scalar>();
+ while (u.norm() < Scalar(1e-4)) u = Vector::Random();
+ while (v.norm() < Scalar(1e-4)) v = Vector::Random();
HLine line_u = HLine::Through(center + u, center + a*u);
HLine line_v = HLine::Through(center + v, center + a*v);
@@ -111,14 +111,14 @@ template<typename Scalar> void lines()
Vector result = line_u.intersection(line_v);
// the lines should intersect at the point we called "center"
- if(abs(a-1) > 1e-2 && abs(v.normalized().dot(u.normalized()))<0.9)
+ if(abs(a-1) > Scalar(1e-2) && abs(v.normalized().dot(u.normalized()))<Scalar(0.9))
VERIFY_IS_APPROX(result, center);
// check conversions between two types of lines
PLine pl(line_u); // gcc 3.3 will commit suicide if we don't name this variable
HLine line_u2(pl);
CoeffsType converted_coeffs = line_u2.coeffs();
- if(line_u2.normal().dot(line_u.normal())<0.)
+ if(line_u2.normal().dot(line_u.normal())<Scalar(0))
converted_coeffs = -line_u2.coeffs();
VERIFY(line_u.coeffs().isApprox(converted_coeffs));
}
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index 761bb52b4..25130c19a 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -30,7 +30,7 @@ template<typename QuatType> void check_slerp(const QuatType& q0, const QuatType&
Scalar largeEps = test_precision<Scalar>();
Scalar theta_tot = AA(q1*q0.inverse()).angle();
- if(theta_tot>EIGEN_PI)
+ if(theta_tot>Scalar(EIGEN_PI))
theta_tot = Scalar(2.*EIGEN_PI)-theta_tot;
for(Scalar t=0; t<=Scalar(1.001); t+=Scalar(0.1))
{
@@ -115,8 +115,8 @@ template<typename Scalar, int Options> void quaternion(void)
// Do not execute the test if the rotation angle is almost zero, or
// the rotation axis and v1 are almost parallel.
if (abs(aa.angle()) > 5*test_precision<Scalar>()
- && (aa.axis() - v1.normalized()).norm() < 1.99
- && (aa.axis() + v1.normalized()).norm() < 1.99)
+ && (aa.axis() - v1.normalized()).norm() < Scalar(1.99)
+ && (aa.axis() + v1.normalized()).norm() < Scalar(1.99))
{
VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1);
}
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index 51f90036d..48393a5c6 100644
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -466,7 +466,7 @@ template<typename Scalar, int Mode, int Options> void transformations()
Scalar a2 = R0.slerp(Scalar(k+1)/Scalar(path_steps), R1).angle();
l += std::abs(a2-a1);
}
- VERIFY(l<=EIGEN_PI*(Scalar(1)+NumTraits<Scalar>::epsilon()*Scalar(path_steps/2)));
+ VERIFY(l<=Scalar(EIGEN_PI)*(Scalar(1)+NumTraits<Scalar>::epsilon()*Scalar(path_steps/2)));
// check basic features
{
diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp
index 292f33969..e7f4b3dc5 100644
--- a/test/linearstructure.cpp
+++ b/test/linearstructure.cpp
@@ -21,6 +21,7 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
*/
typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
Index rows = m.rows();
Index cols = m.cols();
@@ -32,7 +33,7 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
m3(rows, cols);
Scalar s1 = internal::random<Scalar>();
- while (abs(s1)<1e-3) s1 = internal::random<Scalar>();
+ while (abs(s1)<RealScalar(1e-3)) s1 = internal::random<Scalar>();
Index r = internal::random<Index>(0, rows-1),
c = internal::random<Index>(0, cols-1);
diff --git a/test/lu.cpp b/test/lu.cpp
index f14435114..9787f4d86 100644
--- a/test/lu.cpp
+++ b/test/lu.cpp
@@ -11,6 +11,11 @@
#include <Eigen/LU>
using namespace std;
+template<typename MatrixType>
+typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) {
+ return m.cwiseAbs().colwise().sum().maxCoeff();
+}
+
template<typename MatrixType> void lu_non_invertible()
{
typedef typename MatrixType::Index Index;
@@ -143,7 +148,14 @@ template<typename MatrixType> void lu_invertible()
m3 = MatrixType::Random(size,size);
m2 = lu.solve(m3);
VERIFY_IS_APPROX(m3, m1*m2);
- VERIFY_IS_APPROX(m2, lu.inverse()*m3);
+ MatrixType m1_inverse = lu.inverse();
+ VERIFY_IS_APPROX(m2, m1_inverse*m3);
+
+ RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
+ const RealScalar rcond_est = lu.rcond();
+ // Verify that the estimated condition number is within a factor of 10 of the
+ // truth.
+ VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
// test solve with transposed
lu.template _solve_impl_transposed<false>(m3, m2);
@@ -170,6 +182,7 @@ template<typename MatrixType> void lu_partial_piv()
PartialPivLU.h
*/
typedef typename MatrixType::Index Index;
+ typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
Index size = internal::random<Index>(1,4);
MatrixType m1(size, size), m2(size, size), m3(size, size);
@@ -181,7 +194,13 @@ template<typename MatrixType> void lu_partial_piv()
m3 = MatrixType::Random(size,size);
m2 = plu.solve(m3);
VERIFY_IS_APPROX(m3, m1*m2);
- VERIFY_IS_APPROX(m2, plu.inverse()*m3);
+ MatrixType m1_inverse = plu.inverse();
+ VERIFY_IS_APPROX(m2, m1_inverse*m3);
+
+ RealScalar rcond = (RealScalar(1) / matrix_l1_norm(m1)) / matrix_l1_norm(m1_inverse);
+ const RealScalar rcond_est = plu.rcond();
+ // Verify that the estimate is within a factor of 10 of the truth.
+ VERIFY(rcond_est > rcond / 10 && rcond_est < rcond * 10);
// test solve with transposed
plu.template _solve_impl_transposed<false>(m3, m2);
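
For reference, the quantity these new checks verify is the reciprocal L1 condition number 1/(||A||_1 * ||A^-1||_1). A minimal standalone sketch (matrix_l1_norm is the helper added at the top of lu.cpp; the rest is standard Eigen):

    // Exact reciprocal L1 condition number, to be compared against lu.rcond().
    #include <Eigen/Dense>
    using namespace Eigen;

    double exact_rcond(const MatrixXd& A) {  // A assumed invertible
      const MatrixXd A_inv = A.partialPivLu().inverse();
      const double norm_A     = A.cwiseAbs().colwise().sum().maxCoeff();
      const double norm_A_inv = A_inv.cwiseAbs().colwise().sum().maxCoeff();
      return 1.0 / (norm_A * norm_A_inv);    // 1 / kappa_1(A)
    }
    // The tests accept any estimate within a factor of 10:
    //   exact_rcond(A)/10 < lu.rcond() < exact_rcond(A)*10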
diff --git a/test/main.h b/test/main.h
index bba5e7570..1bfb9e1b0 100644
--- a/test/main.h
+++ b/test/main.h
@@ -275,6 +275,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a))
+#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b))
+#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b))
+
+
#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b))
#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b))
#define VERIFY_IS_APPROX(a, b) VERIFY(verifyIsApprox(a, b))
@@ -298,7 +302,7 @@ namespace Eigen {
template<typename T> inline typename NumTraits<T>::Real test_precision() { return NumTraits<T>::dummy_precision(); }
template<> inline float test_precision<float>() { return 1e-3f; }
template<> inline double test_precision<double>() { return 1e-6; }
-template<> inline long double test_precision<long double>() { return 1e-6; }
+template<> inline long double test_precision<long double>() { return 1e-6l; }
template<> inline float test_precision<std::complex<float> >() { return test_precision<float>(); }
template<> inline double test_precision<std::complex<double> >() { return test_precision<double>(); }
template<> inline long double test_precision<std::complex<long double> >() { return test_precision<long double>(); }
@@ -316,9 +320,9 @@ inline bool test_isMuchSmallerThan(const float& a, const float& b)
{ return internal::isMuchSmallerThan(a, b, test_precision<float>()); }
inline bool test_isApproxOrLessThan(const float& a, const float& b)
{ return internal::isApproxOrLessThan(a, b, test_precision<float>()); }
+
inline bool test_isApprox(const double& a, const double& b)
{ return internal::isApprox(a, b, test_precision<double>()); }
-
inline bool test_isMuchSmallerThan(const double& a, const double& b)
{ return internal::isMuchSmallerThan(a, b, test_precision<double>()); }
inline bool test_isApproxOrLessThan(const double& a, const double& b)
@@ -359,6 +363,12 @@ inline bool test_isApproxOrLessThan(const long double& a, const long double& b)
{ return internal::isApproxOrLessThan(a, b, test_precision<long double>()); }
#endif // EIGEN_TEST_NO_LONGDOUBLE
+inline bool test_isApprox(const half& a, const half& b)
+{ return internal::isApprox(a, b, test_precision<half>()); }
+inline bool test_isMuchSmallerThan(const half& a, const half& b)
+{ return internal::isMuchSmallerThan(a, b, test_precision<half>()); }
+inline bool test_isApproxOrLessThan(const half& a, const half& b)
+{ return internal::isApproxOrLessThan(a, b, test_precision<half>()); }
// test_relative_error returns the relative difference between a and b as a real scalar as used in isApprox.
template<typename T1,typename T2>
@@ -426,9 +436,7 @@ template<typename T1,typename T2>
typename NumTraits<T1>::Real test_relative_error(const T1 &a, const T2 &b, typename internal::enable_if<internal::is_arithmetic<typename NumTraits<T1>::Real>::value, T1>::type* = 0)
{
typedef typename NumTraits<T1>::Real RealScalar;
- using std::min;
- using std::sqrt;
- return sqrt(RealScalar(numext::abs2(a-b))/RealScalar((min)(numext::abs2(a),numext::abs2(b))));
+ return numext::sqrt(RealScalar(numext::abs2(a-b))/RealScalar((numext::mini)(numext::abs2(a),numext::abs2(b))));
}
template<typename T>
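
The rewritten test_relative_error computes the same quantity as before, sqrt(|a-b|^2 / min(|a|^2, |b|^2)), but through numext:: so it also compiles on CUDA and for custom scalar types. A plain-scalar sketch of the formula:

    // Relative error as used by verifyIsApprox, written with std:: for clarity.
    #include <algorithm>
    #include <cmath>

    double relative_error(double a, double b) {
      const double num = (a - b) * (a - b);        // |a - b|^2
      const double den = (std::min)(a * a, b * b); // min(|a|^2, |b|^2)
      return std::sqrt(num / den);
    }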
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index a3b469af8..0b381ec6c 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -148,10 +148,14 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
VERIFY_IS_APPROX(sd*vd.adjoint()*mcd, sd*vd.adjoint().template cast<CD>().eval()*mcd);
VERIFY_IS_APPROX(scd*vd.adjoint()*mcd, scd*vd.adjoint().template cast<CD>().eval()*mcd);
- VERIFY_IS_APPROX(sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
+ VERIFY_IS_APPROX( sd*vcd.adjoint()*md.template triangularView<Upper>(), sd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Upper>());
VERIFY_IS_APPROX(scd*vcd.adjoint()*md.template triangularView<Lower>(), scd*vcd.adjoint()*md.template cast<CD>().eval().template triangularView<Lower>());
- VERIFY_IS_APPROX(sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
+ VERIFY_IS_APPROX( sd*vcd.adjoint()*md.transpose().template triangularView<Upper>(), sd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Upper>());
+ VERIFY_IS_APPROX(scd*vcd.adjoint()*md.transpose().template triangularView<Lower>(), scd*vcd.adjoint()*md.transpose().template cast<CD>().eval().template triangularView<Lower>());
+ VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Lower>());
VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.template triangularView<Upper>());
+ VERIFY_IS_APPROX( sd*vd.adjoint()*mcd.transpose().template triangularView<Lower>(), sd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Lower>());
+ VERIFY_IS_APPROX(scd*vd.adjoint()*mcd.transpose().template triangularView<Upper>(), scd*vd.adjoint().template cast<CD>().eval()*mcd.transpose().template triangularView<Upper>());
// Not supported yet: trmm
// VERIFY_IS_APPROX(sd*mcd*md.template triangularView<Lower>(), sd*mcd*md.template cast<CD>().eval().template triangularView<Lower>());
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 37da6c86f..7f5a6512d 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -387,7 +387,7 @@ template<typename Scalar> void packetmath_real()
data2[i] = internal::random<Scalar>(0,1) * std::pow(Scalar(10), internal::random<Scalar>(-6,6));
}
- if(internal::random<float>(0,1)<0.1)
+ if(internal::random<float>(0,1)<0.1f)
data1[internal::random<int>(0, PacketSize)] = 0;
CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt);
CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog);
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 98f84c53b..845cd40ca 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -71,7 +71,7 @@ void test_product_large()
std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
// only makes sure it compiles fine
- internal::computeProductBlockingSizes<float,float>(k1,m1,n1,1);
+ internal::computeProductBlockingSizes<float,float,std::ptrdiff_t>(k1,m1,n1,1);
}
{
diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp
index 46c54b74f..ef3a6173b 100644
--- a/test/qr_colpivoting.cpp
+++ b/test/qr_colpivoting.cpp
@@ -206,7 +206,7 @@ template<typename MatrixType> void qr_kahan_matrix()
RealScalar c = std::sqrt(1 - s*s);
for (Index i = 0; i < rows; ++i) {
m1(i, i) = pow(s, i);
- m1.row(i).tail(rows - i - 1) = -pow(s, i) * c * MatrixType::Ones(1, rows - i - 1);
+ m1.row(i).tail(rows - i - 1) = -RealScalar(pow(s, i)) * c * MatrixType::Ones(1, rows - i - 1);
}
m1 = (m1 + m1.transpose()).eval();
ColPivHouseholderQR<MatrixType> qr(m1);
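
For context, qr_kahan_matrix builds the classic Kahan matrix, a standard stress test for rank detection in column-pivoted QR: row i carries s^i on the diagonal and -c*s^i in every column to its right, with s^2 + c^2 = 1 (the test then symmetrizes via m1 + m1^T). A dense sketch of the construction:

    // Kahan matrix K(n, s); mirrors the loop in qr_kahan_matrix().
    #include <Eigen/Dense>
    #include <cmath>
    using namespace Eigen;

    MatrixXd kahan(int n, double s) {
      const double c = std::sqrt(1.0 - s * s);
      MatrixXd K = MatrixXd::Zero(n, n);
      for (int i = 0; i < n; ++i) {
        const double si = std::pow(s, i);
        K(i, i) = si;
        K.row(i).tail(n - i - 1).setConstant(-si * c);  // -c*s^i off-diagonal
      }
      return K;
    }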
diff --git a/test/rand.cpp b/test/rand.cpp
index 6790acf15..eeec34191 100644
--- a/test/rand.cpp
+++ b/test/rand.cpp
@@ -29,6 +29,9 @@ template<typename Scalar> void check_all_in_range(Scalar x, Scalar y)
{
mask( check_in_range(x,y)-x )++;
}
+ for(Index i=0; i<mask.size(); ++i)
+ if(mask(i)==0)
+ std::cout << "WARNING: value " << x+i << " not reached." << std::endl;
VERIFY( (mask>0).all() );
}
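
The added loop only reports which values a failing run never produced; the assertion itself is unchanged. A self-contained sketch of the coverage check, assuming the same draw-and-histogram structure as check_all_in_range:

    // Verify that internal::random<Scalar>(x, y) eventually hits every value.
    #include <Eigen/Core>
    #include <iostream>
    using namespace Eigen;

    template <typename Scalar>
    void check_coverage(Scalar x, Scalar y, long draws) {
      ArrayXi mask = ArrayXi::Zero(int(y - x + 1));
      for (long k = 0; k < draws; ++k)
        mask(int(internal::random<Scalar>(x, y) - x))++;
      for (int i = 0; i < mask.size(); ++i)
        if (mask(i) == 0)
          std::cout << "value " << x + i << " never drawn\n";
    }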
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index cb8ebaedf..aa3882583 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -232,11 +232,11 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& re
for (Index i=0; i<m2.rows(); ++i)
{
float x = internal::random<float>(0,1);
- if (x<0.1)
+ if (x<0.1f)
{
// do nothing
}
- else if (x<0.5)
+ else if (x<0.5f)
{
countFalseNonZero++;
m2.insert(i,j) = Scalar(0);
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
index 8a6e0687c..582bf34c3 100644
--- a/test/sparse_block.cpp
+++ b/test/sparse_block.cpp
@@ -150,7 +150,7 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols);
SparseMatrixType m2(rows, cols);
initSparse<Scalar>(density, refMat2, m2);
- if(internal::random<float>(0,1)>0.5) m2.makeCompressed();
+ if(internal::random<float>(0,1)>0.5f) m2.makeCompressed();
Index j0 = internal::random<Index>(0,outer-2);
Index j1 = internal::random<Index>(0,outer-2);
Index n0 = internal::random<Index>(1,outer-(std::max)(j0,j1));
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index 7ec5270e8..501aeeaa6 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -245,7 +245,7 @@ template<typename SparseMatrixType> void sparse_product()
for (int k=0; k<mS.outerSize(); ++k)
for (typename SparseMatrixType::InnerIterator it(mS,k); it; ++it)
if (it.index() == k)
- it.valueRef() *= 0.5;
+ it.valueRef() *= Scalar(0.5);
VERIFY_IS_APPROX(refS.adjoint(), refS);
VERIFY_IS_APPROX(mS.adjoint(), mS);
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index d95f301d5..b3e1dda25 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -12,7 +12,7 @@
template<typename Scalar,typename StorageIndex> void sparse_vector(int rows, int cols)
{
double densityMat = (std::max)(8./(rows*cols), 0.01);
- double densityVec = (std::max)(8./float(rows), 0.1);
+ double densityVec = (std::max)(8./(rows), 0.1);
typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
typedef Matrix<Scalar,Dynamic,1> DenseVector;
typedef SparseVector<Scalar,0,StorageIndex> SparseVectorType;
diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp
index 50d1fcdf2..e8605fd21 100644
--- a/test/sparseqr.cpp
+++ b/test/sparseqr.cpp
@@ -54,7 +54,7 @@ template<typename Scalar> void test_sparseqr_scalar()
b = dA * DenseVector::Random(A.cols());
solver.compute(A);
- if(internal::random<float>(0,1)>0.5)
+ if(internal::random<float>(0,1)>0.5f)
solver.factorize(A); // this checks that calling analyzePattern is not needed if the pattern does not change.
if (solver.info() != Success)
{
diff --git a/test/svd_common.h b/test/svd_common.h
index d8611b541..3588eefaa 100644
--- a/test/svd_common.h
+++ b/test/svd_common.h
@@ -141,14 +141,14 @@ void svd_least_square(const MatrixType& m, unsigned int computationOptions)
using std::abs;
SolutionType y(x);
- y.row(k) = (1.+2*NumTraits<RealScalar>::epsilon())*x.row(k);
+ y.row(k) = (RealScalar(1)+2*NumTraits<RealScalar>::epsilon())*x.row(k);
RealScalar residual_y = (m*y-rhs).norm();
VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) ++g_test_level;
VERIFY( test_isApprox(residual_y,residual) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) --g_test_level;
- y.row(k) = (1.-2*NumTraits<RealScalar>::epsilon())*x.row(k);
+ y.row(k) = (RealScalar(1)-2*NumTraits<RealScalar>::epsilon())*x.row(k);
residual_y = (m*y-rhs).norm();
VERIFY( test_isMuchSmallerThan(abs(residual_y-residual), rhs_norm) || residual < residual_y );
if(internal::is_same<RealScalar,float>::value) ++g_test_level;
diff --git a/test/svd_fill.h b/test/svd_fill.h
index 7e44b3d05..500954d47 100644
--- a/test/svd_fill.h
+++ b/test/svd_fill.h
@@ -54,7 +54,7 @@ void svd_fill_random(MatrixType &m, int Option = 0)
}
Matrix<Scalar,Dynamic,1> samples(7);
- samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -1./NumTraits<RealScalar>::highest(), 1./NumTraits<RealScalar>::highest();
+ samples << 0, 5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324, -RealScalar(1)/NumTraits<RealScalar>::highest(), RealScalar(1)/NumTraits<RealScalar>::highest();
if(Option==Symmetric)
{
@@ -80,6 +80,8 @@ void svd_fill_random(MatrixType &m, int Option = 0)
Index i = internal::random<Index>(0,m.rows()-1);
Index j = internal::random<Index>(0,m.cols()-1);
m(j,i) = m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
+ if(NumTraits<Scalar>::IsComplex)
+ *(&numext::real_ref(m(j,i))+1) = *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1));
}
}
}
@@ -91,8 +93,14 @@ void svd_fill_random(MatrixType &m, int Option = 0)
if(!(dup && unit_uv))
{
Index n = internal::random<Index>(0,m.size()-1);
- for(Index i=0; i<n; ++i)
- m(internal::random<Index>(0,m.rows()-1), internal::random<Index>(0,m.cols()-1)) = samples(internal::random<Index>(0,samples.size()-1));
+ for(Index k=0; k<n; ++k)
+ {
+ Index i = internal::random<Index>(0,m.rows()-1);
+ Index j = internal::random<Index>(0,m.cols()-1);
+ m(i,j) = samples(internal::random<Index>(0,samples.size()-1));
+ if(NumTraits<Scalar>::IsComplex)
+ *(&numext::real_ref(m(i,j))+1) = samples.real()(internal::random<Index>(0,samples.size()-1));
+ }
}
}
}
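
The *(&numext::real_ref(m(i,j))+1) writes used above poke the imaginary part of a complex coefficient in place; this is valid because std::complex<T> is guaranteed to be layout-compatible with T[2], real part first. A minimal sketch of the trick in isolation:

    // Set the imaginary part through the array layout of std::complex.
    #include <complex>

    void set_imag_in_place(std::complex<double>& z, double v) {
      double* parts = reinterpret_cast<double*>(&z);  // {real, imag}
      parts[1] = v;
    }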
diff --git a/test/swap.cpp b/test/swap.cpp
index 5d6f0e6af..f76e3624d 100644
--- a/test/swap.cpp
+++ b/test/swap.cpp
@@ -74,10 +74,13 @@ template<typename MatrixType> void swap(const MatrixType& m)
m1 = m1_copy;
m3 = m3_copy;
- // test assertion on mismatching size -- matrix case
- VERIFY_RAISES_ASSERT(m1.swap(m1.row(0)));
- // test assertion on mismatching size -- xpr case
- VERIFY_RAISES_ASSERT(m1.row(0).swap(m1));
+ if(m1.rows()>1)
+ {
+ // test assertion on mismatching size -- matrix case
+ VERIFY_RAISES_ASSERT(m1.swap(m1.row(0)));
+ // test assertion on mismatching size -- xpr case
+ VERIFY_RAISES_ASSERT(m1.row(0).swap(m1));
+ }
}
void test_swap()
diff --git a/test/triangular.cpp b/test/triangular.cpp
index 936c2aef3..3e120f406 100644
--- a/test/triangular.cpp
+++ b/test/triangular.cpp
@@ -65,7 +65,7 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
m1 = MatrixType::Random(rows, cols);
for (int i=0; i<rows; ++i)
- while (numext::abs2(m1(i,i))<1e-1) m1(i,i) = internal::random<Scalar>();
+ while (numext::abs2(m1(i,i))<RealScalar(1e-1)) m1(i,i) = internal::random<Scalar>();
Transpose<MatrixType> trm4(m4);
// test back and forward substitution with a vector as the rhs
@@ -78,7 +78,7 @@ template<typename MatrixType> void triangular_square(const MatrixType& m)
m3 = m1.template triangularView<Lower>();
VERIFY(v2.isApprox(m3.conjugate() * (m1.conjugate().template triangularView<Lower>().solve(v2)), largerEps));
- // test back and forward subsitution with a matrix as the rhs
+ // test back and forward substitution with a matrix as the rhs
m3 = m1.template triangularView<Upper>();
VERIFY(m2.isApprox(m3.adjoint() * (m1.adjoint().template triangularView<Lower>().solve(m2)), largerEps));
m3 = m1.template triangularView<Lower>();
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index 35fbb9781..ee446c3c1 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -22,7 +22,11 @@ template<typename Dst, typename Src>
bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
{
typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar> > traits;
- bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
+ bool res = traits::Traversal==traversal;
+ if(unrolling==InnerUnrolling+CompleteUnrolling)
+ res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling);
+ else
+ res = res && int(traits::Unrolling)==unrolling;
if(!res)
{
std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl;
@@ -147,10 +151,10 @@ struct vectorization_logic
VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3),
InnerVectorizedTraversal,CompleteUnrolling));
-
+
VERIFY(test_assign(Matrix44r().row(2),Matrix44r().row(1)+Matrix44r().row(1),
InnerVectorizedTraversal,CompleteUnrolling));
-
+
if(PacketSize>1)
{
typedef Matrix<Scalar,3,3,ColMajor> Matrix33c;
@@ -158,17 +162,29 @@ struct vectorization_logic
LinearTraversal,CompleteUnrolling));
VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1),
LinearTraversal,CompleteUnrolling));
-
- VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()),
- PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
-
+
+ VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()),
+ LinearVectorizedTraversal,CompleteUnrolling));
+
VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal,NoUnrolling));
-
+
VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
+
+ VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
+ InnerVectorizedTraversal,CompleteUnrolling));
+
+ VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()),
+ InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling));
}
-
+
+ VERIFY(test_redux(Vector1(),
+ LinearVectorizedTraversal,CompleteUnrolling));
+
+ VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
+ LinearVectorizedTraversal,CompleteUnrolling));
+
VERIFY(test_redux(Matrix3(),
LinearVectorizedTraversal,CompleteUnrolling));
@@ -226,6 +242,7 @@ struct vectorization_logic_half
typedef Matrix<Scalar,PacketSize,1> Vector1;
typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11;
typedef Matrix<Scalar,5*PacketSize,7,ColMajor> Matrix57;
+ typedef Matrix<Scalar,3*PacketSize,5,ColMajor> Matrix35;
typedef Matrix<Scalar,5*PacketSize,7,DontAlign|ColMajor> Matrix57u;
// typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44;
// typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u;
@@ -291,12 +308,24 @@ struct vectorization_logic_half
VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
+
+ VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
+ InnerVectorizedTraversal,CompleteUnrolling));
+
+ VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()),
+ InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling));
}
+ VERIFY(test_redux(Vector1(),
+ LinearVectorizedTraversal,CompleteUnrolling));
+
+ VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
+ LinearVectorizedTraversal,CompleteUnrolling));
+
VERIFY(test_redux(Matrix3(),
LinearVectorizedTraversal,CompleteUnrolling));
- VERIFY(test_redux(Matrix57(),
+ VERIFY(test_redux(Matrix35(),
LinearVectorizedTraversal,CompleteUnrolling));
VERIFY(test_redux(Matrix57().template block<PacketSize,3>(1,0),
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index f1d9f0482..a40bc4715 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(Eigen_CXX11_HEADERS Core Tensor TensorSymmetry)
+set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
install(FILES
${Eigen_CXX11_HEADERS}
diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core
deleted file mode 100644
index 946145f5a..000000000
--- a/unsupported/Eigen/CXX11/Core
+++ /dev/null
@@ -1,51 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_CORE_MODULE
-#define EIGEN_CXX11_CORE_MODULE
-
-#include <Eigen/Core>
-
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
-
-/** \defgroup CXX11_Core_Module C++11 Core Module
- *
- * This module provides common core features for all modules that
- * explicitly depend on C++11. Currently, this is only the Tensor
- * module. Note that at this stage, you should not need to include
- * this module directly.
- *
- * It also provides a limited fallback for compilers that don't support
- * CXX11 yet, such as nvcc.
- *
- * \code
- * #include <Eigen/CXX11/Core>
- * \endcode
- */
-
-#include <vector>
-
-#include "src/Core/util/EmulateArray.h"
-#include "src/Core/util/MaxSizeVector.h"
-
-// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
-// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
-// supports enough of the standard for our needs
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
-#include "src/Core/util/CXX11Workarounds.h"
-#include "src/Core/util/CXX11Meta.h"
-#else
-#include "src/Core/util/EmulateCXX11Meta.h"
-#endif
-
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
-
-#endif // EIGEN_CXX11_CORE_MODULE
-
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 16132398d..1e97ad3c0 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -11,10 +11,12 @@
//#ifndef EIGEN_CXX11_TENSOR_MODULE
//#define EIGEN_CXX11_TENSOR_MODULE
-#include "Core"
+#include "../../../Eigen/Core"
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
/** \defgroup CXX11_Tensor_Module Tensor Module
*
@@ -26,6 +28,7 @@
* \endcode
*/
+#include <cmath>
#include <cstddef>
#include <cstring>
@@ -51,11 +54,7 @@ typedef unsigned __int64 uint64_t;
#endif
#ifdef EIGEN_USE_THREADS
-#include <atomic>
-#include <condition_variable>
-#include <deque>
-#include <mutex>
-#include <thread>
+#include "ThreadPool"
#endif
#ifdef EIGEN_USE_GPU
@@ -84,6 +83,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry
index f1dc25fea..fb1b0c0fb 100644
--- a/unsupported/Eigen/CXX11/TensorSymmetry
+++ b/unsupported/Eigen/CXX11/TensorSymmetry
@@ -14,6 +14,8 @@
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "src/util/CXX11Meta.h"
+
/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
*
 * This module provides classes that allow for the definition of
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
new file mode 100644
index 000000000..09d637e9a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_MODULE
+#define EIGEN_CXX11_THREADPOOL_MODULE
+
+#include "../../../Eigen/Core"
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
+ *
+ * This module provides two thread pool implementations:
+ * - a simple reference implementation
+ * - a faster non-blocking implementation
+ *
+ * This module requires C++11.
+ *
+ * \code
+ * #include <Eigen/CXX11/ThreadPool>
+ * \endcode
+ */
+
+
+// The code depends on CXX11, so only include the module if the
+// compiler supports it.
+#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+#include <cstddef>
+#include <cstring>
+#include <stdint.h>
+#include <time.h>
+
+#include <vector>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
+#include <functional>
+#include <memory>
+
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
+
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/EventCount.h"
+#include "src/ThreadPool/RunQueue.h"
+#include "src/ThreadPool/ThreadPoolInterface.h"
+#include "src/ThreadPool/ThreadEnvironment.h"
+#include "src/ThreadPool/SimpleThreadPool.h"
+#include "src/ThreadPool/NonBlockingThreadPool.h"
+
+#endif
+
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+
+#endif // EIGEN_CXX11_THREADPOOL_MODULE
+
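
A minimal usage sketch for the new module. The Schedule(std::function<void()>) entry point comes from ThreadPoolInterface.h and the NonBlockingThreadPool name from the headers included above; treat both as assumptions about this new API rather than documented guarantees:

    #include <Eigen/CXX11/ThreadPool>
    #include <atomic>

    int main() {
      Eigen::NonBlockingThreadPool pool(4);    // 4 worker threads
      std::atomic<int> done(0);
      for (int i = 0; i < 100; ++i)
        pool.Schedule([&done] { ++done; });    // enqueue closures
      while (done.load() < 100) { }            // crude wait, sketch only
      return 0;
    }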
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
index d90ee1b0f..1734262bb 100644
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/src/CMakeLists.txt
@@ -1,3 +1,4 @@
-add_subdirectory(Core)
+add_subdirectory(util)
+add_subdirectory(ThreadPool)
add_subdirectory(Tensor)
add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt
deleted file mode 100644
index 28571dcb9..000000000
--- a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt
deleted file mode 100644
index 1e3b14712..000000000
--- a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Core_util_SRCS "*.h")
-
-INSTALL(FILES
- ${Eigen_CXX11_Core_util_SRCS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Core/util COMPONENT Devel
- )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index f1ec04c49..babafe108 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -112,6 +112,11 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
return CoeffReturnType(index, m_impl.coeff(index));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 199d2ce41..cb615c75b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -36,7 +36,7 @@ struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
static const int Layout = internal::traits<LhsXprType>::Layout;
enum {
- Flags = 0,
+ Flags = 0
};
};
@@ -89,12 +89,18 @@ template<typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
{
typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess,
+ RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
@@ -104,12 +110,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
}
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
-
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// The dimensions of the lhs and the rhs tensors should be equal to prevent
@@ -150,6 +150,19 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
return m_leftImpl.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // We assume that evalPacket or evalScalar is called to perform the
+ // assignment, and we account for the cost of the write here; the left cost
+ // is reduced by one load because we use m_leftImpl.coeffRef.
+ TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
+ return m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(
+ numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
+ left.bytes_stored(), left.compute_cycles()) +
+ TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
private:
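
The TensorOpCost arguments throughout these hunks are (bytes loaded, bytes stored, compute cycles), and evaluators compose costs by addition, exactly as the new costPerCoeff above does. A small sketch, with the float addition out = a + b as a hypothetical right-hand side:

    #include <unsupported/Eigen/CXX11/Tensor>
    using Eigen::TensorOpCost;

    TensorOpCost cost_of_float_add() {
      TensorOpCost rhs(2 * sizeof(float), 0, 1);  // load a and b, one add
      TensorOpCost store(0, sizeof(float), 0);    // write the result
      return rhs + store;                         // (8, 4, 1) per coefficient
    }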
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 69d1802d5..1a34f3ccc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -334,6 +334,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
}
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
+ operator^(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_boolean_xor_op());
+ }
+
// Comparisons and tests.
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index b6e6db12a..c771496e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -101,6 +101,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -140,9 +143,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -247,9 +247,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
@@ -284,12 +283,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// Todo: this could be extended to the second dimension if we're not
// broadcasting alongside the first dimension, and so on.
- if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
return m_impl.template packet<Unaligned>(inputIndex);
} else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
- for (int i = 1; i < packetSize; ++i) {
+ for (int i = 1; i < PacketSize; ++i) {
values[i] = coeffColMajor(originalIndex+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -300,9 +299,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
@@ -337,12 +335,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// Todo: this could be extended to the second dimension if we're not
// broadcasting alongside the first dimension, and so on.
- if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) {
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
return m_impl.template packet<Unaligned>(inputIndex);
} else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
- for (int i = 1; i < packetSize; ++i) {
+ for (int i = 1; i < PacketSize; ++i) {
values[i] = coeffRowMajor(originalIndex+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -350,6 +348,29 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ double compute_cost = TensorOpCost::AddCost<Index>();
+ if (NumDims > 0) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ compute_cost += TensorOpCost::DivCost<Index>();
+ if (internal::index_statically_eq<Broadcast>()(i, 1)) {
+ compute_cost +=
+ TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ } else {
+ if (!internal::index_statically_eq<InputDimensions>()(i, 1)) {
+ compute_cost += TensorOpCost::MulCost<Index>() +
+ TensorOpCost::ModCost<Index>() +
+ TensorOpCost::AddCost<Index>();
+ }
+ }
+ compute_cost +=
+ TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ }
+ }
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index c21a98fe0..f7a9f4ed3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -134,6 +134,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -148,8 +152,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
- // We could also support the case where NumInputDims==1 if needed.
- EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT(NumInputDims >= 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -180,9 +183,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
m_inputOffset = m_stride * op.offset();
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -202,17 +202,16 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = m_impl.coeff(inputIndex);
inputIndex += m_inputStride;
}
@@ -226,13 +225,13 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
} else {
const Index idx = index / m_stride;
const Index rem = index - idx * m_stride;
- if (rem + packetSize <= m_stride) {
+ if (rem + PacketSize <= m_stride) {
Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
return m_impl.template packet<LoadMode>(inputIndex);
} else {
// Cross the stride boundary. Fallback to slow path.
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index);
++index;
}
@@ -242,6 +241,28 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ double cost = 0;
+ if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
+ m_dim.actualDim() == 0) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
+ m_dim.actualDim() == NumInputDims - 1)) {
+ cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
+ m_dim.actualDim() == NumInputDims - 1) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
+ m_dim.actualDim() == 0)) {
+ cost += TensorOpCost::AddCost<Index>();
+ } else {
+ cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
+ 3 * TensorOpCost::AddCost<Index>();
+ }
+
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
@@ -298,6 +319,9 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -309,9 +333,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
: Base(op, device)
{ }
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
@@ -320,17 +341,16 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(this->m_stride == 1);
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
this->m_impl.coeffRef(inputIndex) = values[i];
inputIndex += this->m_inputStride;
}
@@ -342,14 +362,14 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
} else {
const Index idx = index / this->m_stride;
const Index rem = index - idx * this->m_stride;
- if (rem + packetSize <= this->m_stride) {
+ if (rem + PacketSize <= this->m_stride) {
const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
this->m_impl.template writePacket<StoreMode>(inputIndex, x);
} else {
// Cross stride boundary. Fallback to slow path.
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index) = values[i];
++index;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 7738f18fb..839c6e3e5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -260,6 +260,21 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>() +
+ TensorOpCost::ModCost<Index>());
+ const double lhs_size = m_leftImpl.dimensions().TotalSize();
+ const double rhs_size = m_rightImpl.dimensions().TotalSize();
+ return (lhs_size / (lhs_size + rhs_size)) *
+ m_leftImpl.costPerCoeff(vectorized) +
+ (rhs_size / (lhs_size + rhs_size)) *
+ m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
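
The weighting above reflects that a uniformly random output coefficient falls in the left tensor with probability |L|/(|L|+|R|), so the expected per-coefficient cost is the size-weighted average of the two inputs plus the fixed index arithmetic. A sketch using the same operators the hunk relies on:

    // Expected cost of reading one coefficient of concat(L, R).
    #include <unsupported/Eigen/CXX11/Tensor>
    using Eigen::TensorOpCost;

    TensorOpCost concat_cost(const TensorOpCost& left, const TensorOpCost& right,
                             double lhs_size, double rhs_size,
                             double index_cycles) {
      const double p_left = lhs_size / (lhs_size + rhs_size);
      return p_left * left + (1.0 - p_left) * right +
             TensorOpCost(0, 0, index_cycles);
    }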
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index f070ba61e..f8ec0614f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -37,11 +37,11 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
typedef typename remove_reference<RhsNested>::type _RhsNested;
// From NumDims below.
- static const int NumDimensions = max_n_1<traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size;
+ static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
static const int Layout = traits<LhsXprType>::Layout;
enum {
- Flags = 0,
+ Flags = 0
};
};
@@ -65,7 +65,7 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
typedef Device_ Device;
// From NumDims below.
- static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size;
+ static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
};
} // end namespace internal
@@ -140,11 +140,11 @@ struct TensorContractionEvaluatorBase
static const int RDims =
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
typedef DSizes<Index, NumDims> Dimensions;
@@ -218,11 +218,9 @@ struct TensorContractionEvaluatorBase
rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
}
- m_i_strides[0] = 1;
- m_j_strides[0] = 1;
- if(ContractDims) {
- m_k_strides[0] = 1;
- }
+ if (m_i_strides.size() > 0) m_i_strides[0] = 1;
+ if (m_j_strides.size() > 0) m_j_strides[0] = 1;
+ if (m_k_strides.size() > 0) m_k_strides[0] = 1;
m_i_size = 1;
m_j_size = 1;
@@ -318,11 +316,6 @@ struct TensorContractionEvaluatorBase
}
}
- // Scalar case. We represent the result as a 1d tensor of size 1.
- if (LDims + RDims == 2 * ContractDims) {
- m_dimensions[0] = 1;
- }
-
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -426,6 +419,99 @@ struct TensorContractionEvaluatorBase
buffer, resIncr, alpha);
}
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+ // define mr, nr, and all of my data mapper types
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ const Index nr = Traits::nr;
+ const Index mr = Traits::mr;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+ const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, lhs_packet_size,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, rhs_packet_size,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+ // Declare GEBP packing and kernel structs
+ internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
+ internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
+
+ internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+ // Sizes of the blocks to load in cache. See the Goto paper for details.
+ internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+ const Index kc = blocking.kc();
+ const Index mc = numext::mini(m, blocking.mc());
+ const Index nc = numext::mini(n, blocking.nc());
+ const Index sizeA = mc * kc;
+ const Index sizeB = kc * nc;
+
+ LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
+ RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
+
+ for(Index i2=0; i2<m; i2+=mc)
+ {
+ const Index actual_mc = numext::mini(i2+mc,m)-i2;
+ for (Index k2 = 0; k2 < k; k2 += kc) {
+ // make sure we don't overshoot right edge of left matrix, then pack vertical panel
+ const Index actual_kc = numext::mini(k2 + kc, k) - k2;
+ pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+
+ // series of horizontal blocks
+ for (Index j2 = 0; j2 < n; j2 += nc) {
+ // make sure we don't overshoot right edge of right matrix, then pack block
+ const Index actual_nc = numext::mini(j2 + nc, n) - j2;
+ pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+
+ // call gebp (matrix kernel)
+ // The parameters here are copied from Eigen's GEMM implementation
+ gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
+ }
+ }
+ }
+
+ this->m_device.deallocate(blockA);
+ this->m_device.deallocate(blockB);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_leftImpl.cleanup();
m_rightImpl.cleanup();
@@ -440,6 +526,10 @@ struct TensorContractionEvaluatorBase
return m_result[index];
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+ }
+
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
@@ -491,7 +581,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout
};
// Most of the code is assuming that both input tensors are ColMajor. If the
@@ -510,15 +600,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
static const int ContractDims = internal::array_size<Indices>::value;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
// Could we use NumDimensions here?
typedef DSizes<Index, NumDims> Dimensions;
-
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) { }
@@ -529,100 +618,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
return;
}
- evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
- // define mr, nr, and all of my data mapper types
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
- const Index nr = Traits::nr;
- const Index mr = Traits::mr;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
- const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
-
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, left_nocontract_t,
- contract_t, lhs_packet_size,
- lhs_inner_dim_contiguous,
- false, Unaligned> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, right_nocontract_t,
- contract_t, rhs_packet_size,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
- // Declare GEBP packing and kernel structs
- internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
- internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
- internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
-
- // initialize data mappers
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
- this->m_left_contracting_strides, this->m_k_strides);
-
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
- this->m_right_contracting_strides, this->m_k_strides);
-
- OutputMapper output(buffer, m);
-
- // Sizes of the blocks to load in cache. See the Goto paper for details.
- internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
- const Index kc = blocking.kc();
- const Index mc = numext::mini(m, blocking.mc());
- const Index nc = numext::mini(n, blocking.nc());
- const Index sizeA = mc * kc;
- const Index sizeB = kc * nc;
-
- LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
- RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
-
- for(Index i2=0; i2<m; i2+=mc)
- {
- const Index actual_mc = numext::mini(i2+mc,m)-i2;
- for (Index k2 = 0; k2 < k; k2 += kc) {
- // make sure we don't overshoot right edge of left matrix, then pack vertical panel
- const Index actual_kc = numext::mini(k2 + kc, k) - k2;
- pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
-
- // series of horizontal blocks
- for (Index j2 = 0; j2 < n; j2 += nc) {
- // make sure we don't overshoot right edge of right matrix, then pack block
- const Index actual_nc = numext::mini(j2 + nc, n) - j2;
- pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
-
- // call gebp (matrix kernel)
- // The parameters here are copied from Eigen's GEMM implementation
- gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
- }
- }
- }
-
- this->m_device.deallocate(blockA);
- this->m_device.deallocate(blockB);
+ this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
}
};
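
For context, the evalGemm body removed above implements the classic Goto-style partitioning that the shared base-class version keeps: pack an mc x kc panel of the left operand once, then stream kc x nc blocks of the right operand through the gebp micro-kernel. A schematic of the loop nest (illustrative pseudocode, not the exact Eigen calls):

    for (Index i2 = 0; i2 < m; i2 += mc) {       // panels of output rows
      for (Index k2 = 0; k2 < k; k2 += kc) {     // slices of the contraction dim
        pack_lhs(blockA, lhs, i2, k2);           // A panel -> cache-resident buffer
        for (Index j2 = 0; j2 < n; j2 += nc) {   // blocks of output columns
          pack_rhs(blockB, rhs, k2, j2);         // B block -> cache-resident buffer
          gebp(output, i2, j2, blockA, blockB);  // micro-kernel accumulates C += A*B
        }
      }
    }
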
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 3d3f6904f..5cf7b4f71 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -35,9 +35,7 @@ class TensorContractionBlocking {
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
}
else {
- if (kc_ && mc_ && nc_) {
- mc_ = (((m / num_threads) + 15) / 16) * 16;
- }
+ computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
}
}
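
Note the argument order in the new else branch: for row sharding the same blocking helper is reused with mc_ and nc_ exchanged, so the block sizes are computed in the transposed orientation rather than hand-rolled as before. Condensed view of the two branches (condition name illustrative):

    if (shard_by_col) {
      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
    } else {
      // row-sharded: same helper, mc_/nc_ swapped
      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
    }
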
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index dbff660a9..886474986 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -543,12 +543,12 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
#define prefetch_lhs(reg, row, col) \
if (!CHECK_LHS_BOUNDARY) { \
if (col < k_size) { \
- reg =lhs.loadPacket(row, col); \
+ reg =lhs.loadPacket<Unaligned>(row, col); \
} \
} else { \
if (col < k_size) { \
if (row + 3 < m_size) { \
- reg =lhs.loadPacket(row, col); \
+ reg =lhs.loadPacket<Unaligned>(row, col); \
} else if (row + 2 < m_size) { \
reg.x =lhs(row + 0, col); \
reg.y =lhs(row + 1, col); \
@@ -578,7 +578,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -593,7 +593,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} else {
if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
- rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -790,37 +790,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_LHS_BOUNDARY) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else {
// just CHECK_LHS_BOUNDARY
if (lhs_vert + 3 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else if (lhs_vert + 2 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
@@ -909,8 +909,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -932,8 +932,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (rhs_horiz1 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -954,7 +954,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
} else if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -1240,10 +1240,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef DSizes<Index, NumDims> Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 392cb6e3d..b27e1a1b4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -16,7 +16,7 @@ namespace internal {
enum {
Rhs = 0,
- Lhs = 1,
+ Lhs = 1
};
/*
@@ -233,7 +233,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
- template <int AlignmentType = Alignment>
+ template <int AlignmentType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
// whole method makes column major assumption
@@ -276,7 +276,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
return pload<Packet>(data);
}
- template <int AlignmentType = Alignment>
+ template <int AlignmentType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
// whole method makes column major assumption
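
Dropping the default AlignmentType argument forces every call site to state the alignment explicitly, which is why the CUDA kernels above gained <Unaligned> everywhere, and it is the same reason the contraction evaluator now writes this->template evalGemm<...>. In a dependent context the disambiguating template keyword is also required; a minimal sketch (hypothetical helper, not from the patch; assumes the mapper's Packet typedef):

    // Mapper is a template parameter, so loadPacket is a dependent name:
    template <typename Mapper, typename Index>
    EIGEN_DEVICE_FUNC typename Mapper::Packet
    loadUnaligned(const Mapper& m, Index i, Index j) {
      return m.template loadPacket<Eigen::Unaligned>(i, j);
    }
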
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 9044454fd..73c48828c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -92,10 +92,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
- typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
- typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
- static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
typedef DSizes<Index, NumDims> Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index a96776a77..a2f1f71f5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -177,7 +177,6 @@ template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eva
};
-
// Eval as rvalue
template<typename TargetType, typename ArgType, typename Device>
struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
@@ -190,6 +189,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename PacketType<SrcType, Device>::type PacketSourceType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -231,6 +231,21 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
return converter.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
+ if (vectorized) {
+ const double SrcCoeffRatio =
+ internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+ const double TgtCoeffRatio =
+ internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+ return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
+ TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
+ } else {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
+ }
+ }
+
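
A worked instance of the vectorized branch (the ratios here are illustrative, not a real type_casting_traits specialization): with SrcCoeffRatio = 2, TgtCoeffRatio = 1 and PacketSize = 4, producing one output packet consumes two source packets, so the source cost is scaled by 2/4 and the cast itself contributes 1 * cast_cost / 4 compute cycles per output coefficient:

    // per the formula above:
    //   m_impl.costPerCoeff(true) * (2.0 / 4)      // source loads
    // + TensorOpCost(0, 0, 1.0 * cast_cost / 4)    // amortized cast compute
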
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 4fe1fb943..abdf742c6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -233,7 +233,7 @@ struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
static const int Layout = traits<InputXprType>::Layout;
enum {
- Flags = 0,
+ Flags = 0
};
};
@@ -254,7 +254,7 @@ struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, t
template<typename Indices, typename InputXprType, typename KernelXprType>
-class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
@@ -297,6 +297,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
@@ -367,10 +372,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
@@ -405,7 +406,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
{
- const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
Index indices[2] = {index, index+PacketSize-1};
Index startInputs[2] = {0, 0};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -448,6 +448,23 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
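
As a sanity check of the formula (illustrative numbers): a 2-D convolution with a 3x3 kernel gives kernel_size = 9, so each output coefficient pays the index-setup term once plus nine times the per-tap work:

    // NumDims = 2, 3x3 kernel:
    //   firstIndex_compute_cost = 2 * (2*AddCost + 2*MulCost + DivCost)  // once
    //   per-tap cost            = input_load + kernel_load + Add + Mul
    //   total per coefficient   = firstIndex_compute_cost + 9 * per-tap cost
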
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
@@ -773,6 +790,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
typedef typename InputArgType::Scalar Scalar;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
@@ -1044,6 +1062,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+ // model.
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
private:
// No assignment (copies are needed by the kernels)
TensorEvaluator& operator = (const TensorEvaluator&);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
new file mode 100644
index 000000000..4236c75a6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -0,0 +1,213 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+
+//#if !defined(EIGEN_USE_GPU)
+//#define EIGEN_USE_COST_MODEL
+//#endif
+
+namespace Eigen {
+
+/** \class TensorCostModel
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A cost model used to limit the number of threads used for evaluating
+ * tensor expressions.
+ *
+ */
+
+// Class storing the cost of evaluating a tensor expression in terms of the
+// estimated number of operand bytes loaded, bytes stored, and compute cycles.
+class TensorOpCost {
+ public:
+ // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
+ // model based on minimal reciprocal throughput numbers from Intel or
+ // Agner Fog's tables would be better than what is there now.
+ template <typename ArgType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() {
+ return internal::functor_traits<
+ internal::scalar_product_op<ArgType, ArgType> >::Cost;
+ }
+ template <typename ArgType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() {
+ return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
+ }
+ template <typename ArgType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() {
+ return internal::functor_traits<
+ internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
+ }
+ template <typename ArgType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() {
+ return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
+ }
+ template <typename SrcType, typename TargetType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() {
+ return internal::functor_traits<
+ internal::scalar_cast_op<SrcType, TargetType> >::Cost;
+ }
+
+ TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+ TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
+ : bytes_loaded_(bytes_loaded),
+ bytes_stored_(bytes_stored),
+ compute_cycles_(compute_cycles) {}
+
+ TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
+ bool vectorized, double packet_size)
+ : bytes_loaded_(bytes_loaded),
+ bytes_stored_(bytes_stored),
+ compute_cycles_(vectorized ? compute_cycles / packet_size
+ : compute_cycles) {
+ eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+ eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+ eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
+ return bytes_loaded_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
+ return bytes_stored_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
+ return compute_cycles_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
+ double load_cost, double store_cost, double compute_cost) const {
+ return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
+ compute_cost * compute_cycles_;
+ }
+
+ // Drop memory access component. Intended for cases when memory accesses are
+ // sequential or are completely masked by computations.
+ EIGEN_DEVICE_FUNC void dropMemoryCost() {
+ bytes_loaded_ = 0;
+ bytes_stored_ = 0;
+ }
+
+ // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
+ const TensorOpCost& rhs) {
+ bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+ bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
+ compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
+ return *this;
+ }
+
+ // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
+ const TensorOpCost& rhs) {
+ bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+ bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
+ compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
+ const TensorOpCost& rhs) {
+ bytes_loaded_ += rhs.bytes_loaded();
+ bytes_stored_ += rhs.bytes_stored();
+ compute_cycles_ += rhs.compute_cycles();
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
+ bytes_loaded_ *= rhs;
+ bytes_stored_ *= rhs;
+ compute_cycles_ *= rhs;
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
+ TensorOpCost lhs, const TensorOpCost& rhs) {
+ lhs += rhs;
+ return lhs;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+ TensorOpCost lhs, double rhs) {
+ lhs *= rhs;
+ return lhs;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+ double lhs, TensorOpCost rhs) {
+ rhs *= lhs;
+ return rhs;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
+ return os << "[bytes_loaded = " << tc.bytes_loaded()
+ << ", bytes_stored = " << tc.bytes_stored()
+ << ", compute_cycles = " << tc.compute_cycles() << "]";
+ }
+
+ private:
+ double bytes_loaded_;
+ double bytes_stored_;
+ double compute_cycles_;
+};
+
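
A short usage sketch of the arithmetic defined above (assuming the Tensor module header is included; the weights are illustrative):

    // Per-coefficient cost of reading two floats and adding them:
    Eigen::TensorOpCost lhs(sizeof(float), 0, 0);
    Eigen::TensorOpCost rhs(sizeof(float), 0, 0);
    Eigen::TensorOpCost op(0, 0, Eigen::TensorOpCost::AddCost<float>());
    Eigen::TensorOpCost total = lhs + rhs + op;   // 8 bytes loaded + one add
    // Fold to a single scalar using per-byte load/store weights:
    double cycles = total.total_cost(11.0 / 64, 11.0 / 64, 1.0);
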
+// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
+// in [1:max_threads] instead of just switching multi-threading off for small
+// work units.
+template <typename Device>
+class TensorCostModel {
+ public:
+ // Scaling from Eigen compute cost to device cycles.
+ static const int kDeviceCyclesPerComputeCycle = 1;
+
+ // Costs in device cycles.
+ static const int kStartupCycles = 100000;
+ static const int kPerThreadCycles = 100000;
+ static const int kTaskSize = 40000;
+
+ // Returns the number of threads in [1:max_threads] to use for
+ // evaluating an expression with the given output size and cost per
+ // coefficient.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
+ double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
+ double cost = totalCost(output_size, cost_per_coeff);
+ int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+ return numext::mini(max_threads, numext::maxi(1, threads));
+ }
+
+ // taskSize assesses parallel task size.
+ // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+ // granularity needs to be increased to mitigate parallelization overheads.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
+ double output_size, const TensorOpCost& cost_per_coeff) {
+ return totalCost(output_size, cost_per_coeff) / kTaskSize;
+ }
+
+ private:
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
+ double output_size, const TensorOpCost& cost_per_coeff) {
+ // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+ // 11 is L2 cache latency on Haswell.
+ // We don't know whether data is in L1, L2 or L3. But we are most interested
+ // in single-threaded computational time around 100us-10ms (smaller time
+ // is too small for parallelization, larger time is not interesting
+ // either because we are probably using all available threads already).
+ // And for the target time range, L2 seems to be what matters. Data set
+ // fitting into L1 is too small to take noticeable time. Data set fitting
+ // only into L3 presumably will take more than 10ms to load and process.
+ const double kLoadCycles = 1.0 / 64 * 11;
+ const double kStoreCycles = 1.0 / 64 * 11;
+ // Scaling from Eigen compute cost to device cycles.
+ return output_size *
+ cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
+ kDeviceCyclesPerComputeCycle);
+ }
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
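
Working numThreads through with the constants above: an expression with 1e6 output coefficients whose per-coefficient cost folds to 1.0 device cycle has totalCost = 1e6, so any total work below kStartupCycles stays single-threaded, and here:

    // output_size = 1e6, folded cost = 1.0 cycle:
    //   totalCost = 1e6
    //   threads   = (1e6 - 100000) / 100000 + 0.9 = 9.9  -> 9 after truncation
    //   used      = mini(max_threads, maxi(1, 9))
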
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index b58e513b4..e020d076f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -83,8 +83,10 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
typedef typename internal::traits<ArgType>::Index Index;
static const int NumDims = internal::traits<ArgType>::NumDimensions;
typedef DSizes<Index, NumDims> Dimensions;
- typedef
- typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
+ typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -101,9 +103,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
m_dimensions = op.func().dimensions(op.expression());
}
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
@@ -134,6 +133,11 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
protected:
@@ -236,6 +240,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
static const int NumDims = internal::traits<XprType>::NumDimensions;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -252,9 +259,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
}
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
@@ -284,6 +288,11 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 821835cf3..1d2d162dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -291,15 +291,9 @@ struct GpuDevice {
int max_blocks_;
};
-#ifndef __CUDA_ARCH__
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
-#else
-#define LAUNCH_CUDA_KERNEL(kernel, ...) \
- { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
- eigen_assert(false && "Cannot launch a kernel from another kernel" __CUDA_ARCH__);
-#endif
// FIXME: Should be device and kernel specific.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 267f6f8e3..9d141395b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -44,6 +44,26 @@ struct DefaultDevice {
#endif
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+#ifndef __CUDA_ARCH__
+ // Running on the host CPU
+ return l1CacheSize();
+#else
+ // Running on a CUDA device, return the amount of shared memory available.
+ return 48*1024;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+#ifndef __CUDA_ARCH__
+ // Running single threaded on the host CPU
+ return l3CacheSize();
+#else
+ // Running on a CUDA device
+ return firstLevelCacheSize();
+#endif
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index cd3dd214b..c02891465 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,145 +12,15 @@
namespace Eigen {
-// This defines an interface that ThreadPoolDevice can take to use
-// custom thread pools underneath.
-class ThreadPoolInterface {
- public:
- virtual void Schedule(std::function<void()> fn) = 0;
-
- virtual ~ThreadPoolInterface() {}
-};
-
-// The implementation of the ThreadPool type ensures that the Schedule method
-// runs the functions it is provided in FIFO order when the scheduling is done
-// by a single thread.
-// Environment provides a way to create threads and also allows to intercept
-// task submission and execution.
-template <typename Environment>
-class ThreadPoolTempl : public ThreadPoolInterface {
- public:
- // Construct a pool that contains "num_threads" threads.
- explicit ThreadPoolTempl(int num_threads, Environment env = Environment())
- : env_(env), threads_(num_threads), waiters_(num_threads) {
- for (int i = 0; i < num_threads; i++) {
- threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
- }
- }
-
- // Wait until all scheduled work has finished and then destroy the
- // set of threads.
- ~ThreadPoolTempl() {
- {
- // Wait for all work to get done.
- std::unique_lock<std::mutex> l(mu_);
- while (!pending_.empty()) {
- empty_.wait(l);
- }
- exiting_ = true;
-
- // Wakeup all waiters.
- for (auto w : waiters_) {
- w->ready = true;
- w->task.f = nullptr;
- w->cv.notify_one();
- }
- }
-
- // Wait for threads to finish.
- for (auto t : threads_) {
- delete t;
- }
- }
-
- // Schedule fn() for execution in the pool of threads. The functions are
- // executed in the order in which they are scheduled.
- void Schedule(std::function<void()> fn) {
- Task t = env_.CreateTask(std::move(fn));
- std::unique_lock<std::mutex> l(mu_);
- if (waiters_.empty()) {
- pending_.push_back(std::move(t));
- } else {
- Waiter* w = waiters_.back();
- waiters_.pop_back();
- w->ready = true;
- w->task = std::move(t);
- w->cv.notify_one();
- }
- }
-
- protected:
- void WorkerLoop() {
- std::unique_lock<std::mutex> l(mu_);
- Waiter w;
- Task t;
- while (!exiting_) {
- if (pending_.empty()) {
- // Wait for work to be assigned to me
- w.ready = false;
- waiters_.push_back(&w);
- while (!w.ready) {
- w.cv.wait(l);
- }
- t = w.task;
- w.task.f = nullptr;
- } else {
- // Pick up pending work
- t = std::move(pending_.front());
- pending_.pop_front();
- if (pending_.empty()) {
- empty_.notify_all();
- }
- }
- if (t.f) {
- mu_.unlock();
- env_.ExecuteTask(t);
- t.f = nullptr;
- mu_.lock();
- }
- }
- }
-
- private:
- typedef typename Environment::Task Task;
- typedef typename Environment::EnvThread Thread;
-
- struct Waiter {
- std::condition_variable cv;
- Task task;
- bool ready;
- };
-
- Environment env_;
- std::mutex mu_;
- MaxSizeVector<Thread*> threads_; // All threads
- MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
- std::deque<Task> pending_; // Queue of pending work
- std::condition_variable empty_; // Signaled on pending_.empty()
- bool exiting_ = false;
-};
-
-struct StlThreadEnvironment {
- struct Task {
- std::function<void()> f;
- };
-
- // EnvThread constructor must start the thread,
- // destructor must join the thread.
- class EnvThread {
- public:
- EnvThread(std::function<void()> f) : thr_(f) {}
- ~EnvThread() { thr_.join(); }
-
- private:
- std::thread thr_;
- };
-
- EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
- Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
- void ExecuteTask(const Task& t) { t.f(); }
-};
-
-typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
+// Use the SimpleThreadPool by default. We'll switch to the new non-blocking
+// thread pool later.
+#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
// Barrier is an object that allows one or more threads to wait until
@@ -264,6 +134,15 @@ struct ThreadPoolDevice {
return num_threads_;
}
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ return l1CacheSize();
+ }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // The L3 cache size is shared between all the cores.
+ return l3CacheSize() / num_threads_;
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
// Should return an enum that encodes the ISA supported by the CPU
return 1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 977dcafb0..7eccdf7de 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -190,13 +190,13 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
#else
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
#endif
@@ -275,7 +275,7 @@ struct DSizes : array<DenseIndex, NumDims> {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
- return internal::array_prod(*static_cast<const Base*>(this));
+ return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() {
@@ -296,25 +296,25 @@ struct DSizes : array<DenseIndex, NumDims> {
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#else
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
eigen_assert(NumDims == 2);
(*this)[0] = i0;
(*this)[1] = i1;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
eigen_assert(NumDims == 3);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
eigen_assert(NumDims == 4);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
(*this)[3] = i3;
}
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
eigen_assert(NumDims == 5);
(*this)[0] = i0;
(*this)[1] = i1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 1fb27a65b..26b1f65a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -34,7 +34,7 @@ struct traits<TensorEvalToOp<XprType> >
static const int Layout = XprTraits::Layout;
enum {
- Flags = 0,
+ Flags = 0
};
};
@@ -56,7 +56,7 @@ struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType>
template<typename XprType>
-class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> >
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
@@ -88,10 +88,14 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
typedef TensorEvalToOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef typename XprType::Index Index;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = true,
- PacketAccess = true,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
@@ -104,10 +108,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
}
- typedef typename XprType::Index Index;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) {
@@ -138,6 +138,13 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // We assume that evalPacket or evalScalar is called to perform the
+ // assignment and account for the cost of the write here.
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; }
private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 947a8ed88..ae4ce3c90 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -101,6 +101,11 @@ struct TensorEvaluator
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ internal::unpacket_traits<PacketReturnType>::size);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
protected:
@@ -184,6 +189,11 @@ struct TensorEvaluator<const Derived, Device>
return loadConstant(m_data+index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ internal::unpacket_traits<PacketReturnType>::size);
+ }
+
EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
protected:
@@ -219,6 +229,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
@@ -237,6 +248,12 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
return m_functor.template packetOp<Index, PacketReturnType>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ internal::unpacket_traits<PacketReturnType>::size);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
@@ -270,6 +287,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
@@ -293,6 +311,12 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+ return m_argImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
@@ -330,6 +354,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -358,6 +383,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+ return m_leftImpl.costPerCoeff(vectorized) +
+ m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
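
Composing the pieces above for a binary expression such as a + b over float tensors (a worked instance, assuming the default functor costs): each leaf evaluator reports TensorOpCost(sizeof(float), 0, 0) and the binary node adds its functor cost, so per output coefficient:

    //   leaf a    : TensorOpCost(4, 0, 0)
    //   leaf b    : TensorOpCost(4, 0, 0)
    //   binary op : TensorOpCost(0, 0, functor_traits<BinaryOp>::Cost)
    //   total     : TensorOpCost(8, 0, Cost)   // 8 bytes loaded + one add
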
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
@@ -398,6 +431,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
typedef typename XprType::Index Index;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -425,7 +459,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
- const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
internal::Selector<PacketSize> select;
for (Index i = 0; i < PacketSize; ++i) {
select.select[i] = m_condImpl.coeff(index+i);
@@ -435,6 +468,13 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
m_elseImpl.template packet<LoadMode>(index));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return m_condImpl.costPerCoeff(vectorized) +
+ m_thenImpl.costPerCoeff(vectorized)
+ .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
+ }
+
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 4f4e07aaf..1155354cd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -59,9 +59,17 @@ class TensorExecutor<Expression, DefaultDevice, true>
{
const Index size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
+ // unroll the loop at the expense of inlining.
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+ for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
+ }
const Index VectorizedSize = (size / PacketSize) * PacketSize;
-
- for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+ for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
for (Index i = VectorizedSize; i < size; ++i) {
@@ -78,8 +86,9 @@ class TensorExecutor<Expression, DefaultDevice, true>
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
- static void run(Evaluator evaluator, const Index first, const Index last) {
- eigen_assert(last > first);
+ static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ Evaluator evaluator = *evaluator_in;
+ eigen_assert(last >= first);
for (Index i = first; i < last; ++i) {
evaluator.evalScalar(i);
}
@@ -88,28 +97,35 @@ struct EvalRange {
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
- static void run(Evaluator evaluator, const Index first, const Index last) {
- eigen_assert(last > first);
-
+ static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ Evaluator evaluator = *evaluator_in;
+ eigen_assert(last >= first);
Index i = first;
- static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
- Index lastPacket = last - (last % PacketSize);
- for (; i < lastPacket; i += PacketSize) {
+ Index last_chunk_offset = last - 4 * PacketSize;
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
+ // unroll the loop at the expense of inlining.
+ for (; i <= last_chunk_offset; i += 4*PacketSize) {
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
+ }
+ last_chunk_offset = last - PacketSize;
+ for (; i <= last_chunk_offset; i += PacketSize) {
evaluator.evalPacket(i);
}
}
-
for (; i < last; ++i) {
evaluator.evalScalar(i);
}
}
};
-template<typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
-{
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
@@ -119,24 +135,34 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
+ const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
const Index size = array_prod(evaluator.dimensions());
-
- static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-
- int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
- const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
- const unsigned int numblocks = static_cast<unsigned int>(size / blocksize);
-
- Barrier barrier(numblocks);
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize);
+ size_t num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+ if (num_threads > 1) {
+ num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ size, evaluator.costPerCoeff(Vectorizable), num_threads);
}
-
- if (static_cast<Index>(numblocks) * blocksize < size) {
- EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
+#endif
+ if (num_threads == 1) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+ } else {
+ Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+ const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+ const Index numblocks = size / blocksize;
+
+ Barrier barrier(numblocks);
+ for (int i = 0; i < numblocks; ++i) {
+ device.enqueue_with_barrier(
+ &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
+ &evaluator, i * blocksize, (i + 1) * blocksize);
+ }
+ if (numblocks * blocksize < size) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(
+ &evaluator, numblocks * blocksize, size);
+ }
+ barrier.Wait();
}
-
- barrier.Wait();
}
evaluator.cleanup();
}
@@ -147,98 +173,78 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
-template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, false> {
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable> {
public:
typedef typename Expression::Index Index;
- static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
+ static void run(const Expression& expr, const GpuDevice& device);
};
-template <typename Expression>
-class TensorExecutor<Expression, GpuDevice, true> {
- public:
- typedef typename Expression::Index Index;
- static EIGEN_DEVICE_FUNC void run(const Expression& expr, const GpuDevice& device);
-};
#if defined(__CUDACC__)
+template <typename Evaluator, typename Index, bool Vectorizable>
+struct EigenMetaKernelEval {
+ static __device__ EIGEN_ALWAYS_INLINE
+ void run(Evaluator& eval, Index first, Index last, Index step_size) {
+ for (Index i = first; i < last; i += step_size) {
+ eval.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator, typename Index>
+struct EigenMetaKernelEval<Evaluator, Index, true> {
+ static __device__ EIGEN_ALWAYS_INLINE
+ void run(Evaluator& eval, Index first, Index last, Index step_size) {
+ const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const Index vectorized_size = (last / PacketSize) * PacketSize;
+ const Index vectorized_step_size = step_size * PacketSize;
+
+ // Use the vector path
+ for (Index i = first * PacketSize; i < vectorized_size;
+ i += vectorized_step_size) {
+ eval.evalPacket(i);
+ }
+ for (Index i = vectorized_size + first; i < last; i += step_size) {
+ eval.evalScalar(i);
+ }
+ }
+};
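
The vectorized specialization is a grid-stride loop in units of packets: thread t starts at t * PacketSize and advances by step_size * PacketSize, and all threads then share the scalar tail. With PacketSize = 4 and two threads (illustrative):

    // thread 0 evaluates packets at 0, 8, 16, ...
    // thread 1 evaluates packets at 4, 12, 20, ...
    // scalar tail: i = vectorized_size + first; i < last; i += step_size
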
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) {
- // Cuda memcopies the kernel arguments. That's fine for POD, but for more
- // complex types such as evaluators we should really conform to the C++
- // standard and call a proper copy constructor.
- Evaluator eval(memcopied_eval);
+EigenMetaKernel(Evaluator memcopied_eval, Index size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
- // Use the scalar path
- for (Index i = first_index; i < size; i += step_size) {
- eval.evalScalar(i);
- }
-}
-
-template <typename Evaluator, typename Index>
-__global__ void
-__launch_bounds__(1024)
-EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
// Cuda memcopies the kernel arguments. That's fine for POD, but for more
// complex types such as evaluators we should really conform to the C++
// standard and call a proper copy constructor.
Evaluator eval(memcopied_eval);
- const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
- const Index step_size = blockDim.x * gridDim.x;
-
- // Use the vector path
- const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
- const Index vectorized_step_size = step_size * PacketSize;
- const Index vectorized_size = (size / PacketSize) * PacketSize;
- for (Index i = first_index * PacketSize; i < vectorized_size;
- i += vectorized_step_size) {
- eval.evalPacket(i);
- }
- for (Index i = vectorized_size + first_index; i < size; i += step_size) {
- eval.evalScalar(i);
- }
+ const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+ EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}
/*static*/
-template <typename Expression>
-EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
-{
+template <typename Expression, bool Vectorizable>
+inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+ const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
+ if (needs_assign) {
const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
- // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
- const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
- LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
- }
- evaluator.cleanup();
-}
+ // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
-
-/*static*/
-template<typename Expression>
-EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
-{
- TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const int block_size = device.maxCudaThreadsPerBlock();
- const int max_blocks = numext::mini<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
- const Index size = array_prod(evaluator.dimensions());
- // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
- const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
- LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
+ LAUNCH_CUDA_KERNEL(
+ (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+ num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
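
The refactor above collapses the separate vectorizable and non-vectorizable kernels into a single EigenMetaKernel that selects its loop body at compile time via the EigenMetaKernelEval specialization: a grid-stride packet loop over the prefix that is a multiple of the packet size, plus a scalar tail. A minimal standalone sketch of the same dispatch pattern, using a hypothetical FakeEval in place of a real tensor evaluator and plain host C++ in place of a CUDA launch:

#include <cstdio>
#include <vector>

struct FakeEval {
  static const int PacketSize = 4;
  std::vector<float>& out;
  void evalScalar(int i) { out[i] = 1.0f; }
  void evalPacket(int i) {
    for (int k = 0; k < PacketSize; ++k) out[i + k] = 1.0f;
  }
};

template <bool Vectorizable>
struct MetaKernelEval {  // scalar fallback
  static void run(FakeEval& eval, int first, int last, int step) {
    for (int i = first; i < last; i += step) eval.evalScalar(i);
  }
};

template <>
struct MetaKernelEval<true> {  // packet body plus scalar tail
  static void run(FakeEval& eval, int first, int last, int step) {
    const int vectorized_size =
        (last / FakeEval::PacketSize) * FakeEval::PacketSize;
    for (int i = first * FakeEval::PacketSize; i < vectorized_size;
         i += step * FakeEval::PacketSize)
      eval.evalPacket(i);
    for (int i = vectorized_size + first; i < last; i += step)
      eval.evalScalar(i);  // coefficients that do not fill a packet
  }
};

int main() {
  std::vector<float> data(10, 0.0f);
  FakeEval eval{data};
  // Emulate a two-thread grid: thread ids 0 and 1, step == 2.
  MetaKernelEval<true>::run(eval, 0, static_cast<int>(data.size()), 2);
  MetaKernelEval<true>::run(eval, 1, static_cast<int>(data.size()), 2);
  for (float v : data) std::printf("%g ", v);  // all ten coefficients written
  std::printf("\n");
}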
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 49d849e23..ea250d8bc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -40,7 +40,7 @@ struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
static const int Layout = XprTraits::Layout;
enum {
- Flags = 0,
+ Flags = 0
};
};
@@ -163,7 +163,7 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
static const int Layout = XprTraits::Layout;
enum {
- Flags = 0,
+ Flags = 0
};
};
@@ -252,7 +252,7 @@ struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename e
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index d6db45ade..ece2ed91b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -129,6 +129,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
typedef OutputScalar CoeffReturnType;
typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -176,7 +177,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
}
}
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
if (m_data) {
m_device.deallocate(m_data);
@@ -189,11 +189,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
return m_data[index];
}
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
+ template <int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
+ packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
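
The costPerCoeff added here reports one coefficient-sized load, no stores, and no arithmetic, since the FFT work was already paid when the buffer was filled. A rough sketch of the cost object this plausibly builds (an approximation of TensorOpCost, not its actual definition); the key detail is that vectorized compute work is amortized over the packet:

#include <cstdio>

struct Cost {
  double bytes_loaded, bytes_stored, compute_cycles;
  Cost(double bl, double bs, double cc, bool vectorized, double packet_size)
      : bytes_loaded(bl),
        bytes_stored(bs),
        // Per-packet compute work is spread across packet_size coefficients.
        compute_cycles(vectorized ? cc / packet_size : cc) {}
};

int main() {
  // The buffered FFT evaluator reports one float load per coefficient, no
  // stores and no extra arithmetic: the FFT itself was paid at eval time.
  Cost c(sizeof(float), 0, 0, /*vectorized=*/true, /*packet_size=*/4);
  std::printf("load=%gB store=%gB cycles=%g per coefficient\n",
              c.bytes_loaded, c.bytes_stored, c.compute_cycles);
}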
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 9c0ed43b7..b27ee0084 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -128,7 +128,6 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
return m_storage.data()[0];
}
-
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
@@ -137,8 +136,54 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+ {
+ if (Options&RowMajor) {
+ const Index index = i1 + i0 * m_storage.dimensions()[1];
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + i1 * m_storage.dimensions()[0];
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+ {
+ if (Options&RowMajor) {
+ const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+ {
+ if (Options&RowMajor) {
+ const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+ {
+ if (Options&RowMajor) {
+ const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
+ return m_storage.data()[index];
+ }
+ }
#endif
+
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
{
@@ -176,6 +221,51 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+ {
+ if (Options&RowMajor) {
+ const Index index = i1 + i0 * m_storage.dimensions()[1];
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + i1 * m_storage.dimensions()[0];
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+ {
+ if (Options&RowMajor) {
+ const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+ {
+ if (Options&RowMajor) {
+ const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+ {
+ if (Options&RowMajor) {
+ const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
+ return m_storage.data()[index];
+ }
+ }
#endif
EIGEN_DEVICE_FUNC
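
The non-variadic operator() overloads added above linearize multi-indices by Horner's rule, with the last index varying fastest for row-major storage and the first index fastest for column-major. A quick self-contained check of the three-index formulas for dims {2, 3, 4} (values chosen so both layouts address the final element):

#include <cassert>

int main() {
  const int d[3] = {2, 3, 4};
  const int i0 = 1, i1 = 2, i2 = 3;
  // Row-major: the last index varies fastest.
  const int row_major = i2 + d[2] * (i1 + d[1] * i0);  // 3 + 4*(2 + 3*1) = 23
  // Column-major: the first index varies fastest.
  const int col_major = i0 + d[0] * (i1 + d[1] * i2);  // 1 + 2*(2 + 3*3) = 23
  assert(row_major == 23 && col_major == 23);  // last element in either layout
  return 0;
}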
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 14f480901..5d0548b84 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -34,7 +34,7 @@ struct traits<TensorForcedEvalOp<XprType> >
static const int Layout = XprTraits::Layout;
enum {
- Flags = 0,
+ Flags = 0
};
};
@@ -55,7 +55,7 @@ struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<X
template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> >
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -83,10 +83,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
typedef TensorForcedEvalOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = true,
- PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ PacketAccess = (PacketSize > 1),
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};
@@ -95,10 +99,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
{ }
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
@@ -132,6 +132,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; }
private:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index b7c13f67f..33cd00391 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -64,7 +64,7 @@ struct scalar_sigmoid_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
const T one = T(1);
- return one / (one + std::exp(-x));
+ return one / (one + numext::exp(-x));
}
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -158,8 +158,8 @@ template <typename T> struct MeanReducer
}
protected:
- int scalarCount_;
- int packetCount_;
+ DenseIndex scalarCount_;
+ DenseIndex packetCount_;
};
template <typename T> struct MaxReducer
@@ -594,6 +594,8 @@ template <> class UniformRandomGenerator<std::complex<double> > {
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
enum {
+ // Rough estimate.
+ Cost = 100 * NumTraits<Scalar>::MulCost,
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
};
};
@@ -774,6 +776,8 @@ template <typename T> class NormalRandomGenerator {
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
enum {
+ // Rough estimate.
+ Cost = 100 * NumTraits<Scalar>::MulCost,
PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
};
};
@@ -799,7 +803,7 @@ class GaussianGenerator {
T offset = coordinates[i] - m_means[i];
tmp += offset * offset / m_two_sigmas[i];
}
- return std::exp(-tmp);
+ return numext::exp(-tmp);
}
private:
@@ -807,6 +811,15 @@ class GaussianGenerator {
array<T, NumDims> m_two_sigmas;
};
+template <typename T, typename Index, size_t NumDims>
+struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
+ enum {
+ Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
+ functor_traits<scalar_quotient_op<T, T> >::Cost) +
+ functor_traits<scalar_exp_op<T> >::Cost,
+ PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
+ };
+};
} // end namespace internal
} // end namespace Eigen
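
The GaussianGenerator traits charge, per dimension, two additions, one multiplication and one division, plus a single exponential, which matches the operator() body (offset, square, scaled accumulate, then exp). A back-of-the-envelope evaluation with made-up cycle counts, not Eigen's NumTraits values:

#include <cstdio>

int main() {
  const int kAdd = 1, kMul = 1, kDiv = 8, kExp = 20;  // assumed cycle counts
  const int NumDims = 3;
  // Per dimension: offset (add), square (mul), divide, accumulate (add).
  const int cost = NumDims * (2 * kAdd + kMul + kDiv) + kExp;
  std::printf("estimated cost: %d cycles per generated coefficient\n", cost);
}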
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index e4154bd0b..8ff7d5815 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -145,6 +145,14 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool) const {
+ // TODO(rmlarsen): This is just a placeholder. Define an interface that
+ // lets generators report their own cost.
+ return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
+ TensorOpCost::MulCost<Scalar>());
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 72594a05c..bafcc67bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -159,6 +159,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
Device> Self;
typedef TensorEvaluator<ArgType, Device> Impl;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -307,9 +310,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -362,15 +362,14 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
return packetWithPossibleZero(index);
}
- const Index indices[2] = {index, index + packetSize - 1};
+ const Index indices[2] = {index, index + PacketSize - 1};
const Index patchIndex = indices[0] / m_fastPatchStride;
if (patchIndex != indices[1] / m_fastPatchStride) {
return packetWithPossibleZero(index);
@@ -434,12 +433,23 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
Index rowInflateStride() const { return m_row_inflate_strides; }
Index colInflateStride() const { return m_col_inflate_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // We conservatively estimate the cost for the code path where the computed
+ // index is inside the original image and
+ // TensorEvaluator<ArgType, Device>::CoordAccess is false.
+ const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
+ 6 * TensorOpCost::MulCost<Index>() +
+ 8 * TensorOpCost::AddCost<Index>();
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
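
The packet() fast path above loads a whole packet only when the first and last requested coefficients fall inside the same patch; otherwise it falls back to the coefficient-by-coefficient gather. A small sketch of that test with an invented patch stride:

#include <cstdio>
#include <initializer_list>

int main() {
  const int PacketSize = 4;
  const int patch_stride = 6;  // coefficients per patch (invented)
  for (int index : {0, 4}) {
    const int first_patch = index / patch_stride;
    const int last_patch = (index + PacketSize - 1) / patch_stride;
    std::printf("index %d: %s\n", index,
                first_patch == last_patch
                    ? "contiguous packet load"
                    : "gather coefficient by coefficient");
  }
}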
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index 368e6f685..de2f67d74 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -81,6 +81,10 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
@@ -123,11 +127,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -190,18 +189,30 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
+ 3 * TensorOpCost::MulCost<Index>() +
+ 2 * TensorOpCost::AddCost<Index>());
+ const double input_size = m_impl.dimensions().TotalSize();
+ const double output_size = m_dimensions.TotalSize();
+ if (output_size == 0)
+ return TensorOpCost();
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
+ compute_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
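
The inflation cost charges memory traffic at sizeof(CoeffReturnType) scaled by input_size / output_size: the zeros inserted between input samples touch no input memory, so the per-output load cost is amortized. Illustrative arithmetic for a stride-3 inflation of a 10-element tensor (output length (10-1)*3+1 = 28):

#include <cstdio>

int main() {
  const double input_size = 10, output_size = 28;  // stride-3 inflation of 10
  const double bytes_per_coeff = sizeof(float) * input_size / output_size;
  std::printf("amortized load: %.2f bytes per output coefficient\n",
              bytes_per_coeff);  // ~1.43 instead of 4
}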
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 9b85914ff..cd0109ef4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -155,6 +155,10 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
return m_impl.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -177,7 +181,7 @@ template<typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
- CoordAccess = false, // to be implemented
+ CoordAccess = false // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 6af2d45d4..cd04716bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -24,9 +24,17 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
return second;
}
-template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+
+template <typename T, typename X, typename Y>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const X x, const Y y) {
+ return static_cast<T>((x + y - 1) / y);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T divup(const T x, const T y) {
- return (x + y - 1) / y;
+ return static_cast<T>((x + y - 1) / y);
}
template <size_t n> struct max_n_1 {
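
Both divup overloads above compute the integer ceiling of x / y; the three-parameter form exists so callers with mixed argument types can pin the result type explicitly. A quick sanity check:

#include <cassert>

template <typename T, typename X, typename Y>
T divup(const X x, const Y y) { return static_cast<T>((x + y - 1) / y); }

template <typename T>
T divup(const T x, const T y) { return static_cast<T>((x + y - 1) / y); }

int main() {
  assert(divup(10, 3) == 4);       // 10 / 3 rounded up
  assert(divup(9, 3) == 3);        // exact division is unchanged
  assert(divup<int>(7L, 2) == 4);  // mixed types, explicit result type
  return 0;
}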
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index a9c222ea0..bfa65a607 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -142,6 +142,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
return m_impl.template packet<LoadMode>(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -449,6 +453,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+ }
+
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
Scalar* result = m_impl.data();
if (result) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index a595a0175..88b838b27 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -87,6 +87,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<PaddingDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -129,10 +133,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
@@ -224,21 +224,51 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
return m_impl.coeff(inputIndex);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ TensorOpCost cost = m_impl.costPerCoeff(vectorized);
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i)
+ updateCostPerDimension(cost, i, i == 0);
+ } else {
+ for (int i = NumDims - 1; i >= 0; --i)
+ updateCostPerDimension(cost, i, i == NumDims - 1);
+ }
+ return cost;
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ private:
+ void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
+ const double in = static_cast<double>(m_impl.dimensions()[i]);
+ const double out = in + m_padding[i].first + m_padding[i].second;
+ if (out == 0)
+ return;
+ const double reduction = in / out;
+ cost *= reduction;
+ if (first) {
+ cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
+ reduction * (1 * TensorOpCost::AddCost<Index>()));
+ } else {
+ cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ reduction * (2 * TensorOpCost::MulCost<Index>() +
+ 1 * TensorOpCost::DivCost<Index>()));
+ }
+ }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
Index inputIndex = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index first = index;
- const Index last = index + packetSize - 1;
+ const Index last = index + PacketSize - 1;
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
const Index lastPaddedRight = m_outputStrides[i+1];
@@ -263,7 +293,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
}
}
- const Index last = index + packetSize - 1;
+ const Index last = index + PacketSize - 1;
const Index first = index;
const Index lastPaddedLeft = m_padding[0].first;
const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
@@ -288,16 +318,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
Index inputIndex = 0;
for (int i = 0; i < NumDims - 1; ++i) {
const Index first = index;
- const Index last = index + packetSize - 1;
+ const Index last = index + PacketSize - 1;
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
const Index lastPaddedRight = m_outputStrides[i];
@@ -322,7 +351,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
}
}
- const Index last = index + packetSize - 1;
+ const Index last = index + PacketSize - 1;
const Index first = index;
const Index lastPaddedLeft = m_padding[NumDims-1].first;
const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
@@ -347,9 +376,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
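
updateCostPerDimension above scales the inherited cost by in / out, the fraction of output coefficients that actually reach the input, before adding the per-dimension index arithmetic (cheaper for the innermost dimension, which needs no divide). A rough replay with illustrative cycle constants:

#include <cstdio>

int main() {
  const double kAdd = 1, kMul = 1, kDiv = 8;  // assumed cycle counts
  double compute = 10.0;              // inherited per-coefficient cycles
  const double in = 8, out = 10;      // dimension padded from 8 to 10
  const double reduction = in / out;  // fraction of non-padded outputs
  compute *= reduction;
  compute += 2 * kAdd + 2 * kMul + reduction * (2 * kMul + 1 * kDiv);
  std::printf("padded per-coefficient cost: %g cycles\n", compute);  // 20
}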
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 0bf460f4e..a87e45330 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -85,6 +85,10 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
enum {
IsAligned = false,
@@ -137,9 +141,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -183,12 +184,11 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
- Index indices[2] = {index, index + packetSize - 1};
+ Index indices[2] = {index, index + PacketSize - 1};
Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
indices[1] / m_outputStrides[output_stride_index]};
Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
@@ -229,15 +229,15 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
- if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
- EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+ EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
- values[packetSize-1] = m_impl.coeff(inputIndices[1]);
- for (int i = 1; i < packetSize-1; ++i) {
+ values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+ for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -245,6 +245,14 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
+ TensorOpCost::MulCost<Index>() +
+ 2 * TensorOpCost::AddCost<Index>());
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 00f870328..885295f0a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -214,7 +214,7 @@ struct FullReducer {
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
- *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
+ *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
}
};
@@ -222,18 +222,19 @@ struct FullReducer {
#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
- bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+ bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
typename Self::Index numValuesToReduce, Op& reducer,
typename Self::CoeffReturnType* output) {
- *output = InnerMostDimReducer<Self, Op, vectorizable>::reduce(
+ *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
self, firstIndex, numValuesToReduce, reducer);
}
};
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, false> {
+// Multithreaded full reducer
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
static const bool HasOptimizedImplementation = !Op::IsStateful;
static const int PacketSize =
unpacket_traits<typename Self::PacketReturnType>::size;
@@ -247,79 +248,44 @@ struct FullReducer<Self, Op, ThreadPoolDevice, false> {
*output = reducer.finalize(reducer.initialize());
return;
}
- const std::size_t num_threads = device.numThreads();
- if (num_threads == 1) {
- *output = InnerMostDimReducer<Self, Op, false>::reduce(self, 0, num_coeffs, reducer);
- return;
- } else {
- const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
- const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
- eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
-
- Barrier barrier(numblocks);
- MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, false>::run, self,
- i * blocksize, blocksize, reducer, &shards[i]);
- }
-
- typename Self::CoeffReturnType finalShard;
- if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
- finalShard = InnerMostDimReducer<Self, Op, false>::reduce(
- self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
- } else {
- finalShard = reducer.initialize();
- }
- barrier.Wait();
- for (unsigned int i = 0; i < numblocks; ++i) {
- reducer.reduce(shards[i], &finalShard);
- }
- *output = reducer.finalize(finalShard);
- }
- }
-};
-
-template <typename Self, typename Op>
-struct FullReducer<Self, Op, ThreadPoolDevice, true> {
- static const bool HasOptimizedImplementation = !Op::IsStateful;
- static const int PacketSize =
- unpacket_traits<typename Self::PacketReturnType>::size;
-
- // launch one reducer per thread and accumulate the result.
- static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
- typename Self::CoeffReturnType* output) {
- typedef typename Self::Index Index;
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- if (num_coeffs == 0) {
- *output = reducer.finalize(reducer.initialize());
- return;
- }
- const std::size_t num_threads = device.numThreads();
+#ifdef EIGEN_USE_COST_MODEL
+ const TensorOpCost cost =
+ self.m_impl.costPerCoeff(Vectorizable) +
+ TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+ PacketSize);
+ const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ num_coeffs, cost, device.numThreads());
+#else
+ const int num_threads = device.numThreads();
+#endif
if (num_threads == 1) {
- *output = InnerMostDimReducer<Self, Op, true>::reduce(self, 0, num_coeffs, reducer);
+ *output =
+ InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
return;
}
- const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
- const unsigned int numblocks = blocksize > 0 ? static_cast<unsigned int>(num_coeffs / blocksize) : 0;
- eigen_assert(num_coeffs >= static_cast<Index>(numblocks) * blocksize);
+ const Index blocksize =
+ std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+ const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+ eigen_assert(num_coeffs >= numblocks * blocksize);
Barrier barrier(numblocks);
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
- for (unsigned int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, true>::run,
+ for (Index i = 0; i < numblocks; ++i) {
+ device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]);
}
typename Self::CoeffReturnType finalShard;
- if (static_cast<Index>(numblocks) * blocksize < num_coeffs) {
- finalShard = InnerMostDimReducer<Self, Op, true>::reduce(
- self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer);
+ if (numblocks * blocksize < num_coeffs) {
+ finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+ self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
+ reducer);
} else {
finalShard = reducer.initialize();
}
-
barrier.Wait();
- for (unsigned int i = 0; i < numblocks; ++i) {
+
+ for (Index i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
@@ -411,6 +377,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -495,8 +464,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static bool size_large_enough(Index total_size) {
+#ifndef EIGEN_USE_COST_MODEL
+ return total_size > 1024 * 1024;
+#else
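+ // Always true; referencing total_size presumably keeps the parameter used.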
+ return true || total_size;
+#endif
+ }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
@@ -504,7 +478,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
// Use the FullReducer if possible.
if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
- (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) {
+ (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) {
bool need_assign = false;
if (!data) {
@@ -584,16 +558,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
if (ReducingInnerMostDims) {
const Index num_values_to_reduce =
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
const Index firstIndex = firstInput(index);
- for (Index i = 0; i < packetSize; ++i) {
+ for (Index i = 0; i < PacketSize; ++i) {
Op reducer(m_reducer);
values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
num_values_to_reduce, reducer);
@@ -602,18 +575,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
const Index firstIndex = firstInput(index);
const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
// TBD: extend this to the n innermost dimensions that we preserve.
- if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
+ if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
Op reducer(m_reducer);
typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
return reducer.finalizePacket(accum);
} else {
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
}
} else {
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
}
@@ -621,6 +594,18 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
return rslt;
}
+ // Must be called after evalSubExprsIfNeeded().
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ if (RunningFullReduction && m_result) {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ } else {
+ const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+ const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
+ return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
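
For a partial reduction, the per-output cost above is the input's per-coefficient cost times the number of values folded into each output, plus the reducer functor's work for each of those values. Illustrative numbers only:

#include <cstdio>

int main() {
  const double input_cycles = 2;    // assumed cost per input coefficient
  const double functor_cycles = 1;  // e.g. one add for a sum reducer
  const double num_reduced = 64;    // values folded into each output
  const double per_output =
      input_cycles * num_reduced + functor_cycles * num_reduced;
  std::printf("%g cycles per reduced output coefficient\n", per_output);
}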
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index c33d54d6e..fd2587dd5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -130,13 +130,18 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
assert(false && "Should only be called on floats");
}
- static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
typedef typename Self::Index Index;
const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
const int block_size = 256;
const int num_per_thread = 128;
- const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
@@ -231,7 +236,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
return true;
}
- static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
// It's faster to use the usual code.
@@ -310,7 +315,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
return true;
}
- static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
// It's faster to use the usual code.
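
The full GPU reducer's launch geometry: 256 threads per block, 128 values folded per thread, so the block count is the ceiling of num_coeffs / 32768, and the new early return skips the launch entirely for empty tensors. A quick check of the arithmetic (divup as defined in TensorMeta.h above):

#include <cstdio>
#include <initializer_list>

template <typename T>
T divup(const T x, const T y) { return (x + y - 1) / y; }

int main() {
  const int block_size = 256, num_per_thread = 128;
  for (int num_coeffs : {1, 32768, 1 << 20}) {
    // (num_coeffs == 0 never reaches this point: the early return above
    // skips the kernel launch entirely for empty tensors.)
    const int num_blocks = divup(num_coeffs, block_size * num_per_thread);
    std::printf("num_coeffs=%d -> num_blocks=%d\n", num_coeffs, num_blocks);
  }
}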
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 96d92038c..1a59cc8f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -104,6 +104,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<ReverseDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -135,10 +139,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return m_dimensions; }
@@ -195,21 +195,33 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// TODO(ndjaitly): write a better packing routine that uses
// local structure.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
- values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ for (int i = 0; i < NumDims; ++i) {
+ if (m_reverse[i]) {
+ compute_cost += 2 * TensorOpCost::AddCost<Index>();
+ }
+ }
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
@@ -246,6 +258,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }
@@ -256,14 +269,13 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x) {
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// This code is pilfered from TensorMorphing.h
- EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+ EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index+i) = values[i];
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index c19833ea5..e76533710 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -104,6 +104,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -145,9 +148,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -166,18 +166,25 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
@@ -219,6 +226,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -230,9 +240,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
: Base(op, device)
{ }
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
@@ -241,12 +248,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
template <int StoreMode> EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- for (int i = 0; i < packetSize; ++i) {
+ for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index+i) = values[i];
}
}
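
The writePacket/packet bodies in this file, like several evaluators above, share one idiom: stage PacketSize coefficients in a max-aligned stack buffer, then move them with a single packet load or store. A scalar-only sketch, with alignas standing in for EIGEN_ALIGN_MAX and memcpy for pload/pstore:

#include <cstdio>
#include <cstring>

int main() {
  const int PacketSize = 4;
  float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  alignas(16) float values[PacketSize];
  // Gather non-contiguous coefficients (here: every other element)...
  for (int i = 0; i < PacketSize; ++i) values[i] = src[2 * i];
  // ...then move them as one aligned packet-sized block.
  float packet[PacketSize];
  std::memcpy(packet, values, sizeof(values));
  for (float v : packet) std::printf("%g ", v);  // 0 2 4 6
  std::printf("\n");
}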
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 085f8fd3d..52b7d216a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -103,6 +103,10 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
@@ -142,10 +146,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -164,12 +164,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index inputIndices[] = {0, 0};
- Index indices[] = {index, index + packetSize - 1};
+ Index indices[] = {index, index + PacketSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStrides[i];
@@ -193,15 +192,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
}
- if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
- values[packetSize-1] = m_impl.coeff(inputIndices[1]);
- for (int i = 1; i < packetSize-1; ++i) {
+ values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+ for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
@@ -209,6 +208,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
+ TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>()) +
+ TensorOpCost::MulCost<Index>();
+ if (vectorized) {
+ compute_cost *= 2; // packet() computes two indices
+ }
+ const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
+ return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
+ // Computation is not vectorized per se, but it is done once per packet.
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
@@ -266,6 +279,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
@@ -275,12 +289,11 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < this->dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
Index inputIndices[] = {0, 0};
- Index indices[] = {index, index + packetSize - 1};
+ Index indices[] = {index, index + PacketSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / this->m_outputStrides[i];
@@ -304,15 +317,15 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
}
- if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
}
else {
- EIGEN_ALIGN_MAX Scalar values[packetSize];
+ EIGEN_ALIGN_MAX Scalar values[PacketSize];
internal::pstore<Scalar, PacketReturnType>(values, x);
this->m_impl.coeffRef(inputIndices[0]) = values[0];
- this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
- for (int i = 1; i < packetSize-1; ++i) {
+ this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+ for (int i = 1; i < PacketSize-1; ++i) {
this->coeffRef(index+i) = values[i];
}
}
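For intuition, the costPerCoeff model added above can be evaluated by hand. This is a sketch with assumed per-operation weights; the real values come from TensorOpCost::AddCost/MulCost/DivCost for the Index type and are not reproduced here:

// Hypothetical 3-D stride op (NumDims == 3); the add/mul/div weights are assumed.
const int NumDims = 3;
const double add = 1, mul = 1, div = 8;                  // illustrative weights only
double compute_cost = (NumDims - 1) * (add + mul + div)  // index math per outer dimension
                    + mul;                               // innermost dimension
// compute_cost == 21 with these weights. packet() derives two indices, so the
// vectorized path doubles this, and TensorOpCost then amortizes the total over
// PacketSize coefficients.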
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 2f06f8442..b7597b3a5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -40,7 +40,7 @@ class compute_tensor_flags
};
public:
- enum { ret = packet_access_bit};
+ enum { ret = packet_access_bit };
};
@@ -54,7 +54,7 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
enum {
Options = Options_,
- Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit),
+ Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
};
};
@@ -69,7 +69,7 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
enum {
Options = Options_,
- Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit),
+ Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit)
};
};
@@ -86,7 +86,7 @@ struct traits<TensorMap<PlainObjectType, Options_> >
static const int Layout = BaseTraits::Layout;
enum {
Options = Options_,
- Flags = BaseTraits::Flags,
+ Flags = BaseTraits::Flags
};
};
@@ -102,7 +102,7 @@ struct traits<TensorRef<PlainObjectType> >
static const int Layout = BaseTraits::Layout;
enum {
Options = BaseTraits::Options,
- Flags = BaseTraits::Flags,
+ Flags = BaseTraits::Flags
};
};
@@ -253,7 +253,7 @@ struct nested<const TensorRef<PlainObjectType> >
// Pc=0.
typedef enum {
PADDING_VALID = 1,
- PADDING_SAME = 2,
+ PADDING_SAME = 2
} PaddingType;
} // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 3e56589c3..5950f38e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -53,9 +53,7 @@ struct TensorUInt128
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
explicit TensorUInt128(const T& x) : high(0), low(x) {
- typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type UnsignedT;
- typedef typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type UnsignedLow;
- eigen_assert(static_cast<UnsignedT>(x) <= static_cast<UnsignedLow>(NumTraits<LOW>::highest()));
+ eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest())));
eigen_assert(x >= 0);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 5bdfbad46..e735fc76f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -171,6 +171,9 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
static const int NumDims = NumInputDims + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
@@ -336,9 +339,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
}
}
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
@@ -408,16 +408,15 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
- const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+ EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
return packetWithPossibleZero(index);
}
- const Index indices[2] = {index, index + packetSize - 1};
+ const Index indices[2] = {index, index + PacketSize - 1};
const Index patchIndex = indices[0] / m_fastPatchStride;
if (patchIndex != indices[1] / m_fastPatchStride) {
return packetWithPossibleZero(index);
@@ -495,6 +494,14 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
return packetWithPossibleZero(index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost =
+ 10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
+ 8 * TensorOpCost::AddCost<Index>();
+ return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -518,9 +525,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
- const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
new file mode 100644
index 000000000..88fef50c6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")
+
+INSTALL(FILES
+ ${Eigen_CXX11_ThreadPool_SRCS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
+ )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
new file mode 100644
index 000000000..6dd64f185
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -0,0 +1,234 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+
+namespace Eigen {
+
+// EventCount allows waiting for arbitrary predicates in non-blocking
+// algorithms. Think of a condition variable, except that the wait predicate
+// does not need to be protected by a mutex. Usage:
+// Waiting thread does:
+//
+// if (predicate)
+// return act();
+// EventCount::Waiter& w = waiters[my_index];
+// ec.Prewait(&w);
+// if (predicate) {
+// ec.CancelWait(&w);
+// return act();
+// }
+// ec.CommitWait(&w);
+//
+// Notifying thread does:
+//
+// predicate = true;
+// ec.Notify(true);
+//
+// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
+// cheap, but they are executed only if the preceding predicate check has
+// failed.
+//
+// Algorithm outline:
+// There are two main variables: the predicate (managed by the user) and state_.
+// The operation closely resembles Dekker's mutual exclusion algorithm:
+// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
+// The waiting thread sets state_ and then checks the predicate; the notifying
+// thread sets the predicate and then checks state_. Due to the seq_cst fences
+// between these operations it is guaranteed that either the waiter sees the
+// predicate change and doesn't block, or the notifying thread sees the state_
+// change and unblocks the waiter, or both. It cannot happen that both threads
+// miss each other's changes, which would lead to deadlock.
+class EventCount {
+ public:
+ class Waiter;
+
+ EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) {
+ eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
+ // Initialize epoch to something close to overflow to test overflow.
+ state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
+ }
+
+ ~EventCount() {
+ // Ensure there are no waiters.
+ eigen_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
+ }
+
+ // Prewait prepares for waiting.
+ // After calling this function the thread must re-check the wait predicate
+ // and call either CancelWait or CommitWait passing the same Waiter object.
+ void Prewait(Waiter* w) {
+ w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed);
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+ }
+
+ // CommitWait commits waiting.
+ void CommitWait(Waiter* w) {
+ w->state = Waiter::kNotSignaled;
+ // Modification epoch of this waiter.
+ uint64_t epoch =
+ (w->epoch & kEpochMask) +
+ (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+ uint64_t state = state_.load(std::memory_order_seq_cst);
+ for (;;) {
+ if (int64_t((state & kEpochMask) - epoch) < 0) {
+        // The preceding waiter has not decided on its fate. Wait until it
+ // calls either CancelWait or CommitWait, or is notified.
+ EIGEN_THREAD_YIELD();
+ state = state_.load(std::memory_order_seq_cst);
+ continue;
+ }
+ // We've already been notified.
+ if (int64_t((state & kEpochMask) - epoch) > 0) return;
+ // Remove this thread from prewait counter and add it to the waiter list.
+ eigen_assert((state & kWaiterMask) != 0);
+ uint64_t newstate = state - kWaiterInc + kEpochInc;
+ newstate = (newstate & ~kStackMask) | (w - &waiters_[0]);
+ if ((state & kStackMask) == kStackMask)
+ w->next.store(nullptr, std::memory_order_relaxed);
+ else
+ w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed);
+ if (state_.compare_exchange_weak(state, newstate,
+ std::memory_order_release))
+ break;
+ }
+ Park(w);
+ }
+
+  // CancelWait cancels the effects of the previous Prewait call.
+ void CancelWait(Waiter* w) {
+ uint64_t epoch =
+ (w->epoch & kEpochMask) +
+ (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+ uint64_t state = state_.load(std::memory_order_relaxed);
+ for (;;) {
+ if (int64_t((state & kEpochMask) - epoch) < 0) {
+        // The preceding waiter has not decided on its fate. Wait until it
+ // calls either CancelWait or CommitWait, or is notified.
+ EIGEN_THREAD_YIELD();
+ state = state_.load(std::memory_order_relaxed);
+ continue;
+ }
+ // We've already been notified.
+ if (int64_t((state & kEpochMask) - epoch) > 0) return;
+ // Remove this thread from prewait counter.
+ eigen_assert((state & kWaiterMask) != 0);
+ if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
+ std::memory_order_relaxed))
+ return;
+ }
+ }
+
+ // Notify wakes one or all waiting threads.
+ // Must be called after changing the associated wait predicate.
+ void Notify(bool all) {
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+ uint64_t state = state_.load(std::memory_order_acquire);
+ for (;;) {
+ // Easy case: no waiters.
+ if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
+ return;
+ uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+ uint64_t newstate;
+ if (all) {
+ // Reset prewait counter and empty wait list.
+ newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
+ } else if (waiters) {
+ // There is a thread in pre-wait state, unblock it.
+ newstate = state + kEpochInc - kWaiterInc;
+ } else {
+ // Pop a waiter from list and unpark it.
+ Waiter* w = &waiters_[state & kStackMask];
+ Waiter* wnext = w->next.load(std::memory_order_relaxed);
+ uint64_t next = kStackMask;
+ if (wnext != nullptr) next = wnext - &waiters_[0];
+        // Note: we don't add kEpochInc here. The ABA problem on the lock-free
+        // stack can't happen because a waiter is re-pushed onto the stack only
+        // after it was in the pre-wait state, which inevitably leads to an
+        // epoch increment.
+ newstate = (state & kEpochMask) + next;
+ }
+ if (state_.compare_exchange_weak(state, newstate,
+ std::memory_order_acquire)) {
+ if (!all && waiters) return; // unblocked pre-wait thread
+ if ((state & kStackMask) == kStackMask) return;
+ Waiter* w = &waiters_[state & kStackMask];
+ if (!all) w->next.store(nullptr, std::memory_order_relaxed);
+ Unpark(w);
+ return;
+ }
+ }
+ }
+
+ class Waiter {
+ friend class EventCount;
+ std::atomic<Waiter*> next;
+ std::mutex mu;
+ std::condition_variable cv;
+ uint64_t epoch;
+ unsigned state;
+ enum {
+ kNotSignaled,
+ kWaiting,
+ kSignaled,
+ };
+ // Prevent false sharing with other Waiter objects in the same vector.
+ char pad_[128];
+ };
+
+ private:
+  // state_ layout:
+  // - the low kStackBits bits are a stack of waiters that committed to waiting.
+  // - the next kWaiterBits bits count waiters in the prewait state.
+  // - the next kEpochBits bits are a modification counter.
+ static const uint64_t kStackBits = 16;
+ static const uint64_t kStackMask = (1ull << kStackBits) - 1;
+ static const uint64_t kWaiterBits = 16;
+ static const uint64_t kWaiterShift = 16;
+ static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
+ << kWaiterShift;
+ static const uint64_t kWaiterInc = 1ull << kWaiterBits;
+ static const uint64_t kEpochBits = 32;
+ static const uint64_t kEpochShift = 32;
+ static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
+ static const uint64_t kEpochInc = 1ull << kEpochShift;
+ std::atomic<uint64_t> state_;
+ std::vector<Waiter>& waiters_;
+
+ void Park(Waiter* w) {
+ std::unique_lock<std::mutex> lock(w->mu);
+ while (w->state != Waiter::kSignaled) {
+ w->state = Waiter::kWaiting;
+ w->cv.wait(lock);
+ }
+ }
+
+ void Unpark(Waiter* waiters) {
+ Waiter* next = nullptr;
+ for (Waiter* w = waiters; w; w = next) {
+ next = w->next.load(std::memory_order_relaxed);
+ unsigned state;
+ {
+ std::unique_lock<std::mutex> lock(w->mu);
+ state = w->state;
+ w->state = Waiter::kSignaled;
+ }
+ // Avoid notifying if it wasn't waiting.
+ if (state == Waiter::kWaiting) w->cv.notify_one();
+ }
+ }
+
+ EventCount(const EventCount&) = delete;
+ void operator=(const EventCount&) = delete;
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
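To make the waiter/notifier protocol described in the header concrete, here is a minimal usage sketch. The predicate item_available and both function names are illustrative; only the EventCount calls come from the class above, and the waiters vector is sized per the constructor's contract:

#include <atomic>
#include <vector>

void consumer(Eigen::EventCount& ec,
              std::vector<Eigen::EventCount::Waiter>& waiters,
              int my_index, std::atomic<bool>& item_available) {
  if (item_available.load()) return;   // fast path, no blocking
  Eigen::EventCount::Waiter* w = &waiters[my_index];
  ec.Prewait(w);                       // announce the intent to wait
  if (item_available.load()) {         // re-check the predicate
    ec.CancelWait(w);
    return;
  }
  ec.CommitWait(w);                    // block until notified
}

void producer(Eigen::EventCount& ec, std::atomic<bool>& item_available) {
  item_available.store(true);          // change the predicate first...
  ec.Notify(false);                    // ...then notify one waiter
}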
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
new file mode 100644
index 000000000..1c471a19f
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+
+
+namespace Eigen {
+
+template <typename Environment>
+class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
+ public:
+ typedef typename Environment::Task Task;
+ typedef RunQueue<Task, 1024> Queue;
+
+ NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
+ : env_(env),
+ threads_(num_threads),
+ queues_(num_threads),
+ waiters_(num_threads),
+ blocked_(),
+ spinning_(),
+ done_(),
+ ec_(waiters_) {
+ for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue());
+ for (int i = 0; i < num_threads; i++)
+ threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+ }
+
+ ~NonBlockingThreadPoolTempl() {
+ done_.store(true, std::memory_order_relaxed);
+ // Now if all threads block without work, they will start exiting.
+    // But note that threads can continue to work arbitrarily long,
+    // block, submit new work, unblock and otherwise live a full life.
+ ec_.Notify(true);
+
+ // Join threads explicitly to avoid destruction order issues.
+ for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
+ for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
+ }
+
+ void Schedule(std::function<void()> fn) {
+ Task t = env_.CreateTask(std::move(fn));
+ PerThread* pt = GetPerThread();
+ if (pt->pool == this) {
+ // Worker thread of this pool, push onto the thread's queue.
+ Queue* q = queues_[pt->index];
+ t = q->PushFront(std::move(t));
+ } else {
+ // A free-standing thread (or worker of another pool), push onto a random
+ // queue.
+ Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
+ t = q->PushBack(std::move(t));
+ }
+    // Note: below we touch `this` after making t available to worker threads.
+    // Strictly speaking, this can lead to a racy use-after-free. Consider that
+    // Schedule is called from a thread that is neither the main thread nor a
+    // worker thread of this pool. Then, execution of t directly or indirectly
+    // completes the overall computation, which in turn leads to destruction of
+    // `this`. We expect that such a scenario is prevented by the program, that
+    // is, `this` is kept alive while any thread can potentially be in Schedule.
+ if (!t.f)
+ ec_.Notify(false);
+ else
+ env_.ExecuteTask(t); // Push failed, execute directly.
+ }
+
+ private:
+ typedef typename Environment::EnvThread Thread;
+
+ struct PerThread {
+ bool inited;
+ NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads.
+ unsigned index; // Worker thread index in pool.
+ unsigned rand; // Random generator state.
+ };
+
+ Environment env_;
+ MaxSizeVector<Thread*> threads_;
+ MaxSizeVector<Queue*> queues_;
+ std::vector<EventCount::Waiter> waiters_;
+ std::atomic<unsigned> blocked_;
+ std::atomic<bool> spinning_;
+ std::atomic<bool> done_;
+ EventCount ec_;
+
+ // Main worker thread loop.
+ void WorkerLoop(unsigned index) {
+ PerThread* pt = GetPerThread();
+ pt->pool = this;
+ pt->index = index;
+ Queue* q = queues_[index];
+ EventCount::Waiter* waiter = &waiters_[index];
+ std::vector<Task> stolen;
+ for (;;) {
+ Task t;
+ if (!stolen.empty()) {
+ t = std::move(stolen.back());
+ stolen.pop_back();
+ }
+ if (!t.f) t = q->PopFront();
+ if (!t.f) {
+ if (Steal(&stolen)) {
+ t = std::move(stolen.back());
+ stolen.pop_back();
+ while (stolen.size()) {
+ Task t1 = q->PushFront(std::move(stolen.back()));
+ stolen.pop_back();
+ if (t1.f) {
+            // There is not much we can do in this case. Just execute the
+            // remaining tasks directly.
+ stolen.push_back(std::move(t1));
+ break;
+ }
+ }
+ }
+ }
+ if (t.f) {
+ env_.ExecuteTask(t);
+ continue;
+ }
+ // Leave one thread spinning. This reduces latency.
+ if (!spinning_ && !spinning_.exchange(true)) {
+ bool nowork = true;
+ for (int i = 0; i < 1000; i++) {
+ if (!OutOfWork()) {
+ nowork = false;
+ break;
+ }
+ }
+ spinning_ = false;
+ if (!nowork) continue;
+ }
+ if (!WaitForWork(waiter)) return;
+ }
+ }
+
+  // Steal tries to steal work from other worker threads in a best-effort manner.
+ bool Steal(std::vector<Task>* stolen) {
+ if (queues_.size() == 1) return false;
+ PerThread* pt = GetPerThread();
+ unsigned lastq = pt->index;
+ for (unsigned i = queues_.size(); i > 0; i--) {
+ unsigned victim = Rand(&pt->rand) % queues_.size();
+ if (victim == lastq && queues_.size() > 2) {
+ i++;
+ continue;
+ }
+      // Steal half of the elements from a victim queue.
+      // It is typical to steal just one element, but that assumes that work is
+      // recursively subdivided in halves so that the stolen element is exactly
+      // half of the work. If work elements are equally sized, then it makes
+      // sense to steal half of the elements at once and then work locally for
+      // a while.
+ if (queues_[victim]->PopBackHalf(stolen)) return true;
+ lastq = victim;
+ }
+ // Just to make sure that we did not miss anything.
+ for (unsigned i = queues_.size(); i > 0; i--)
+ if (queues_[i - 1]->PopBackHalf(stolen)) return true;
+ return false;
+ }
+
+  // WaitForWork blocks until new work is available or until it is time to exit.
+ bool WaitForWork(EventCount::Waiter* waiter) {
+    // We already did a best-effort emptiness check in Steal, so prepare for
+    // blocking.
+    ec_.Prewait(waiter);
+    // Now do a reliable emptiness check.
+ if (!OutOfWork()) {
+ ec_.CancelWait(waiter);
+ return true;
+ }
+    // The number of blocked threads is used as the termination condition.
+    // If we are shutting down and all worker threads are blocked without work,
+    // we are done.
+ blocked_++;
+ if (done_ && blocked_ == threads_.size()) {
+ ec_.CancelWait(waiter);
+      // Almost done, but we need to re-check the queues.
+      // Consider that all queues are empty and all worker threads are preempted
+      // right after incrementing blocked_ above. Now a free-standing thread
+      // submits work and calls the destructor (which sets done_). If we don't
+      // re-check the queues, we will exit, leaving the work unexecuted.
+ if (!OutOfWork()) {
+ // Note: we must not pop from queues before we decrement blocked_,
+ // otherwise the following scenario is possible. Consider that instead
+ // of checking for emptiness we popped the only element from queues.
+ // Now other worker threads can start exiting, which is bad if the
+ // work item submits other work. So we just check emptiness here,
+ // which ensures that all worker threads exit at the same time.
+ blocked_--;
+ return true;
+ }
+ // Reached stable termination state.
+ ec_.Notify(true);
+ return false;
+ }
+ ec_.CommitWait(waiter);
+ blocked_--;
+ return true;
+ }
+
+ bool OutOfWork() {
+ for (unsigned i = 0; i < queues_.size(); i++)
+ if (!queues_[i]->Empty()) return false;
+ return true;
+ }
+
+ PerThread* GetPerThread() {
+ EIGEN_THREAD_LOCAL PerThread per_thread_;
+ PerThread* pt = &per_thread_;
+ if (pt->inited) return pt;
+ pt->inited = true;
+ pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+ return pt;
+ }
+
+ static unsigned Rand(unsigned* state) {
+ return *state = *state * 1103515245 + 12345;
+ }
+};
+
+typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
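A minimal usage sketch for the pool, assuming the ThreadPool headers above are already included. Since Schedule is fire-and-forget, the completion latch built here from a mutex and condition variable is our own scaffolding, not part of the pool's API:

#include <condition_variable>
#include <mutex>

void pool_sketch() {
  Eigen::NonBlockingThreadPool pool(/*num_threads=*/4);
  std::mutex mu;
  std::condition_variable all_done;
  int pending = 100;
  for (int i = 0; i < 100; ++i) {
    pool.Schedule([&]() {
      // ... do some work ...
      std::lock_guard<std::mutex> l(mu);
      if (--pending == 0) all_done.notify_one();
    });
  }
  std::unique_lock<std::mutex> l(mu);
  all_done.wait(l, [&]() { return pending == 0; });
}  // ~NonBlockingThreadPool notifies the workers and joins them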
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
new file mode 100644
index 000000000..0544a6e15
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -0,0 +1,210 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+
+
+namespace Eigen {
+
+// RunQueue is a fixed-size, partially non-blocking deque of Work items.
+// Operations on front of the queue must be done by a single thread (owner),
+// operations on back of the queue can be done by multiple threads concurrently.
+//
+// Algorithm outline:
+// All remote threads operating on the queue back are serialized by a mutex.
+// This ensures that at most two threads access state: the owner and one remote
+// thread (Size aside). The algorithm ensures that the occupied region of the
+// underlying array is logically contiguous (it can wrap around, but there are
+// no stray occupied elements). The owner operates on one end of this region,
+// the remote thread on the other. Synchronization between these threads
+// (potential consumption of the last element and claiming of the last empty
+// element) happens by means of a state variable in each element. The states
+// are: empty, busy (in the process of insertion or removal) and ready. Threads
+// claim elements (the empty->busy and ready->busy transitions) by means of a
+// CAS operation. The finishing transitions (busy->empty and busy->ready) are
+// done with a plain store, as the element is exclusively owned by the current
+// thread.
+//
+// Note: we could permit only pointers as elements; then we would not need a
+// separate state variable, as a null/non-null pointer value would serve as the
+// state. But that would require a malloc/free per operation for large, complex
+// values (and this is designed to store std::function<void()>).
+template <typename Work, unsigned kSize>
+class RunQueue {
+ public:
+ RunQueue() : front_(), back_() {
+ // require power-of-two for fast masking
+ eigen_assert((kSize & (kSize - 1)) == 0);
+ eigen_assert(kSize > 2); // why would you do this?
+ eigen_assert(kSize <= (64 << 10)); // leave enough space for counter
+ for (unsigned i = 0; i < kSize; i++)
+ array_[i].state.store(kEmpty, std::memory_order_relaxed);
+ }
+
+ ~RunQueue() { eigen_assert(Size() == 0); }
+
+ // PushFront inserts w at the beginning of the queue.
+  // If the queue is full, returns w; otherwise returns a default-constructed Work.
+ Work PushFront(Work w) {
+ unsigned front = front_.load(std::memory_order_relaxed);
+ Elem* e = &array_[front & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kEmpty ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return w;
+ front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+ e->w = std::move(w);
+ e->state.store(kReady, std::memory_order_release);
+ return Work();
+ }
+
+ // PopFront removes and returns the first element in the queue.
+  // If the queue was empty, returns a default-constructed Work.
+ Work PopFront() {
+ unsigned front = front_.load(std::memory_order_relaxed);
+ Elem* e = &array_[(front - 1) & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kReady ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return Work();
+ Work w = std::move(e->w);
+ e->state.store(kEmpty, std::memory_order_release);
+ front = ((front - 1) & kMask2) | (front & ~kMask2);
+ front_.store(front, std::memory_order_relaxed);
+ return w;
+ }
+
+ // PushBack adds w at the end of the queue.
+  // If the queue is full, returns w; otherwise returns a default-constructed Work.
+ Work PushBack(Work w) {
+ std::unique_lock<std::mutex> lock(mutex_);
+ unsigned back = back_.load(std::memory_order_relaxed);
+ Elem* e = &array_[(back - 1) & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kEmpty ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return w;
+ back = ((back - 1) & kMask2) | (back & ~kMask2);
+ back_.store(back, std::memory_order_relaxed);
+ e->w = std::move(w);
+ e->state.store(kReady, std::memory_order_release);
+ return Work();
+ }
+
+  // PopBack removes and returns the last element in the queue.
+ // Can fail spuriously.
+ Work PopBack() {
+    if (Empty()) return Work();
+ std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
+ if (!lock) return Work();
+ unsigned back = back_.load(std::memory_order_relaxed);
+ Elem* e = &array_[back & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kReady ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return Work();
+ Work w = std::move(e->w);
+ e->state.store(kEmpty, std::memory_order_release);
+ back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+ return w;
+ }
+
+  // PopBackHalf removes and returns up to half of the last elements in the
+  // queue. Returns the number of elements removed. Can also fail spuriously.
+ unsigned PopBackHalf(std::vector<Work>* result) {
+ if (Empty()) return 0;
+ std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
+ if (!lock) return 0;
+ unsigned back = back_.load(std::memory_order_relaxed);
+ unsigned size = Size();
+ unsigned mid = back;
+ if (size > 1) mid = back + (size - 1) / 2;
+ unsigned n = 0;
+ unsigned start = 0;
+ for (; static_cast<int>(mid - back) >= 0; mid--) {
+ Elem* e = &array_[mid & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (n == 0) {
+ if (s != kReady ||
+ !e->state.compare_exchange_strong(s, kBusy,
+ std::memory_order_acquire))
+ continue;
+ start = mid;
+ } else {
+        // Note: no need to store the intermediate kBusy state; we exclusively
+        // own these elements.
+ eigen_assert(s == kReady);
+ }
+ result->push_back(std::move(e->w));
+ e->state.store(kEmpty, std::memory_order_release);
+ n++;
+ }
+ if (n != 0)
+ back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+ return n;
+ }
+
+ // Size returns current queue size.
+ // Can be called by any thread at any time.
+ unsigned Size() const {
+    // Emptiness plays a critical role in thread pool blocking. So we go to
+    // great effort not to produce false positives (claiming a non-empty queue
+    // to be empty).
+ for (;;) {
+      // Capture a consistent snapshot of front/back.
+ unsigned front = front_.load(std::memory_order_acquire);
+ unsigned back = back_.load(std::memory_order_acquire);
+ unsigned front1 = front_.load(std::memory_order_relaxed);
+ if (front != front1) continue;
+ int size = (front & kMask2) - (back & kMask2);
+ // Fix overflow.
+ if (size < 0) size += 2 * kSize;
+ // Order of modification in push/pop is crafted to make the queue look
+ // larger than it is during concurrent modifications. E.g. pop can
+ // decrement size before the corresponding push has incremented it.
+ // So the computed size can be up to kSize + 1, fix it.
+ if (size > static_cast<int>(kSize)) size = kSize;
+ return size;
+ }
+ }
+
+ // Empty tests whether container is empty.
+ // Can be called by any thread at any time.
+ bool Empty() const { return Size() == 0; }
+
+ private:
+ static const unsigned kMask = kSize - 1;
+ static const unsigned kMask2 = (kSize << 1) - 1;
+ struct Elem {
+ std::atomic<uint8_t> state;
+ Work w;
+ };
+ enum {
+ kEmpty,
+ kBusy,
+ kReady,
+ };
+ std::mutex mutex_;
+  // The low log(kSize) + 1 bits in front_ and back_ contain the rolling index
+  // of the front/back, respectively. The remaining bits contain modification
+  // counters that are incremented on Push operations. This allows us to (1)
+  // distinguish between empty and full conditions (if we used log(kSize) bits
+  // for the position, these conditions would be indistinguishable); and (2)
+  // obtain a consistent snapshot of front_/back_ for the Size operation using
+  // the modification counters.
+ std::atomic<unsigned> front_;
+ std::atomic<unsigned> back_;
+ Elem array_[kSize];
+
+ RunQueue(const RunQueue&) = delete;
+ void operator=(const RunQueue&) = delete;
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
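A short sketch of the ownership contract described in the header comment. Work = int* is a stand-in chosen so that the default-constructed "empty" value is simply nullptr; the queue is left empty at the end because ~RunQueue asserts emptiness:

#include <vector>

void run_queue_sketch() {
  Eigen::RunQueue<int*, 64> q;  // kSize: a power of two, > 2, <= 64 << 10
  int task = 42;

  // Owner thread only:
  if (q.PushFront(&task) != nullptr) { /* queue was full, the item came back */ }
  int* t = q.PopFront();        // nullptr when the queue is empty

  // Any thread (remote ends are serialized by the internal mutex):
  q.PushBack(t);
  std::vector<int*> stolen;
  unsigned n = q.PopBackHalf(&stolen);  // steals up to half; may fail spuriously
  (void)n;                              // n == 1 here: the single queued item
}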
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
new file mode 100644
index 000000000..17fd1658b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -0,0 +1,127 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
+
+namespace Eigen {
+
+// The implementation of the ThreadPool type ensures that the Schedule method
+// runs the functions it is provided in FIFO order when the scheduling is done
+// by a single thread.
+// The Environment type provides a way to create threads and also allows
+// intercepting task submission and execution.
+template <typename Environment>
+class SimpleThreadPoolTempl : public ThreadPoolInterface {
+ public:
+ // Construct a pool that contains "num_threads" threads.
+ explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
+ : env_(env), threads_(num_threads), waiters_(num_threads) {
+ for (int i = 0; i < num_threads; i++) {
+ threads_.push_back(env.CreateThread([this]() { WorkerLoop(); }));
+ }
+ }
+
+ // Wait until all scheduled work has finished and then destroy the
+ // set of threads.
+ ~SimpleThreadPoolTempl() {
+ {
+ // Wait for all work to get done.
+ std::unique_lock<std::mutex> l(mu_);
+ while (!pending_.empty()) {
+ empty_.wait(l);
+ }
+ exiting_ = true;
+
+      // Wake up all waiters.
+ for (auto w : waiters_) {
+ w->ready = true;
+ w->task.f = nullptr;
+ w->cv.notify_one();
+ }
+ }
+
+ // Wait for threads to finish.
+ for (auto t : threads_) {
+ delete t;
+ }
+ }
+
+ // Schedule fn() for execution in the pool of threads. The functions are
+ // executed in the order in which they are scheduled.
+ void Schedule(std::function<void()> fn) {
+ Task t = env_.CreateTask(std::move(fn));
+ std::unique_lock<std::mutex> l(mu_);
+ if (waiters_.empty()) {
+ pending_.push_back(std::move(t));
+ } else {
+ Waiter* w = waiters_.back();
+ waiters_.pop_back();
+ w->ready = true;
+ w->task = std::move(t);
+ w->cv.notify_one();
+ }
+ }
+
+ protected:
+ void WorkerLoop() {
+ std::unique_lock<std::mutex> l(mu_);
+ Waiter w;
+ Task t;
+ while (!exiting_) {
+ if (pending_.empty()) {
+ // Wait for work to be assigned to me
+ w.ready = false;
+ waiters_.push_back(&w);
+ while (!w.ready) {
+ w.cv.wait(l);
+ }
+ t = w.task;
+ w.task.f = nullptr;
+ } else {
+ // Pick up pending work
+ t = std::move(pending_.front());
+ pending_.pop_front();
+ if (pending_.empty()) {
+ empty_.notify_all();
+ }
+ }
+ if (t.f) {
+ mu_.unlock();
+ env_.ExecuteTask(t);
+ t.f = nullptr;
+ mu_.lock();
+ }
+ }
+ }
+
+ private:
+ typedef typename Environment::Task Task;
+ typedef typename Environment::EnvThread Thread;
+
+ struct Waiter {
+ std::condition_variable cv;
+ Task task;
+ bool ready;
+ };
+
+ Environment env_;
+ std::mutex mu_;
+ MaxSizeVector<Thread*> threads_; // All threads
+ MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads.
+ std::deque<Task> pending_; // Queue of pending work
+ std::condition_variable empty_; // Signaled on pending_.empty()
+ bool exiting_ = false;
+};
+
+typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
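A small sketch of the FIFO guarantee stated in the header comment; with a single worker the tasks also finish in submission order. The names below are illustrative:

#include <mutex>
#include <vector>

void fifo_sketch() {
  Eigen::SimpleThreadPool pool(1);  // one worker => tasks run strictly in order
  std::vector<int> order;
  std::mutex mu;
  for (int i = 0; i < 3; ++i) {
    pool.Schedule([&, i]() {
      std::lock_guard<std::mutex> l(mu);
      order.push_back(i);           // with one worker this yields 0, 1, 2
    });
  }
}  // ~SimpleThreadPool waits for queued work, then joins the worker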
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
new file mode 100644
index 000000000..d2204ad5b
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -0,0 +1,38 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+
+namespace Eigen {
+
+struct StlThreadEnvironment {
+ struct Task {
+ std::function<void()> f;
+ };
+
+ // EnvThread constructor must start the thread,
+ // destructor must join the thread.
+ class EnvThread {
+ public:
+ EnvThread(std::function<void()> f) : thr_(f) {}
+ ~EnvThread() { thr_.join(); }
+
+ private:
+ std::thread thr_;
+ };
+
+ EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
+ Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
+ void ExecuteTask(const Task& t) { t.f(); }
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
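Because the environment is a template parameter of both pool implementations, task submission and execution can be intercepted by wrapping StlThreadEnvironment. A hedged sketch; CountingEnvironment is our own illustration, not part of Eigen:

#include <atomic>
#include <functional>

struct CountingEnvironment {
  typedef Eigen::StlThreadEnvironment::Task Task;
  typedef Eigen::StlThreadEnvironment::EnvThread EnvThread;

  std::atomic<int>* executed;          // shared counter, owned by the caller
  Eigen::StlThreadEnvironment base;

  EnvThread* CreateThread(std::function<void()> f) {
    return base.CreateThread(std::move(f));
  }
  Task CreateTask(std::function<void()> f) {
    return base.CreateTask(std::move(f));
  }
  void ExecuteTask(const Task& t) {
    executed->fetch_add(1);            // the interception point: count executed tasks
    base.ExecuteTask(t);
  }
};

// Usage sketch:
//   std::atomic<int> count(0);
//   CountingEnvironment env{&count};
//   Eigen::SimpleThreadPoolTempl<CountingEnvironment> pool(4, env);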
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
new file mode 100644
index 000000000..cfa221732
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -0,0 +1,22 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+
+// Try to come up with a portable implementation of thread local variables
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_LOCAL static __thread
+#elif EIGEN_COMP_CLANG
+#define EIGEN_THREAD_LOCAL static __thread
+#else
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
new file mode 100644
index 000000000..38b40aceb
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -0,0 +1,26 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+
+namespace Eigen {
+
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+ virtual void Schedule(std::function<void()> fn) = 0;
+
+ virtual ~ThreadPoolInterface() {}
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
new file mode 100644
index 000000000..a859c7ba3
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h
@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+
+// Try to come up with a portable way to yield
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_YIELD() sched_yield()
+#else
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+#endif
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
new file mode 100644
index 000000000..7eab492d6
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_CXX11_util_SRCS "*.h")
+
+INSTALL(FILES
+ ${Eigen_CXX11_util_SRCS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
+ )
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index c582e21f5..ec27eddb8 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -10,12 +10,22 @@
#ifndef EIGEN_CXX11META_H
#define EIGEN_CXX11META_H
+#include <vector>
+#include "EmulateArray.h"
+
+// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
+// Visual Studio 2015 doesn't advertise itself as cxx11 compliant, although it
+// supports enough of the standard for our needs.
+#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+
+#include "CXX11Workarounds.h"
+
namespace Eigen {
namespace internal {
/** \internal
- * \file CXX11/Core/util/CXX11Meta.h
+ * \file CXX11/util/CXX11Meta.h
* This file contains generic metaprogramming classes which are not specifically related to Eigen.
* This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
*/
@@ -523,4 +533,10 @@ InstType instantiate_by_c_array(ArrType* arr)
} // end namespace Eigen
+#else // Non C++11, fallback to emulation mode
+
+#include "EmulateCXX11Meta.h"
+
+#endif
+
#endif // EIGEN_CXX11META_H
diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
index fe4d22803..fe4d22803 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 579519b04..24159e54c 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -222,7 +222,7 @@ template<class T, std::size_t N> struct array_size<const array<T,N>& > {
#else
-// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen array
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
#include <array>
namespace Eigen {
@@ -264,8 +264,4 @@ template<class T, std::size_t N> struct array_size<std::array<T,N> > {
#endif
-
-
-
-
#endif // EIGEN_EMULATE_ARRAY_H
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
index d685d4f9d..f3aa1b144 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
@@ -17,7 +17,7 @@ namespace Eigen {
namespace internal {
/** \internal
- * \file CXX11/Core/util/EmulateCXX11Meta.h
+ * \file CXX11/util/EmulateCXX11Meta.h
* This file emulates a subset of the functionality provided by CXXMeta.h for
* compilers that don't yet support cxx11 such as nvcc.
*/
diff --git a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 551124bae..961456f10 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/MaxSizeVector.h
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -41,7 +41,7 @@ class MaxSizeVector {
// Construct a new MaxSizeVector, reserve and resize to n.
// Copy the init value to all elements.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- explicit MaxSizeVector(size_t n, const T& init)
+ MaxSizeVector(size_t n, const T& init)
: reserve_(n), size_(n),
data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index b30e0a90a..995427978 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -304,7 +304,7 @@ LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType &x)
// m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative
if (!m_useExternalScaling)
m_diag.resize(n);
- eigen_assert( (!m_useExternalScaling || m_diag.size()==n) || "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
+ eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
m_qtf.resize(n);
/* Function Body */
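The operator change in the assertion above (repeated in the HybridNonLinearSolver and NonLinearOptimization hunks below) fixes a classic pitfall: a string literal decays to a non-null pointer, so `cond || "msg"` is always true and the assertion can never fire, whereas `cond && "msg"` preserves the condition and keeps the message in the diagnostic. A minimal illustration:

eigen_assert(false || "message");  // never fires: the literal makes the whole expression true
eigen_assert(false && "message");  // fires as intended, with "message" in the output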
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index bbb7e5776..af515eb13 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -210,9 +210,9 @@ struct matrix_exp_computeUV<MatrixType, float>
using std::pow;
const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
squarings = 0;
- if (l1norm < 4.258730016922831e-001) {
+ if (l1norm < 4.258730016922831e-001f) {
matrix_exp_pade3(arg, U, V);
- } else if (l1norm < 1.880152677804762e+000) {
+ } else if (l1norm < 1.880152677804762e+000f) {
matrix_exp_pade5(arg, U, V);
} else {
const float maxnorm = 3.925724783138660f;
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index 8f7a6f3b0..077853cbd 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -132,6 +132,7 @@ template <typename EivalsType, typename Cluster>
void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
{
typedef typename EivalsType::Index Index;
+ typedef typename EivalsType::RealScalar RealScalar;
for (Index i=0; i<eivals.rows(); ++i) {
// Find cluster containing i-th ei'val, adding a new cluster if necessary
typename std::list<Cluster>::iterator qi = matrix_function_find_cluster(i, clusters);
@@ -145,7 +146,7 @@ void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<C
// Look for other element to add to the set
for (Index j=i+1; j<eivals.rows(); ++j) {
- if (abs(eivals(j) - eivals(i)) <= matrix_function_separation
+ if (abs(eivals(j) - eivals(i)) <= RealScalar(matrix_function_separation)
&& std::find(qi->begin(), qi->end(), j) == qi->end()) {
typename std::list<Cluster>::iterator qj = matrix_function_find_cluster(j, clusters);
if (qj == clusters.end()) {
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index e43e86e90..8a78fc1f7 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -37,6 +37,7 @@ template <typename MatrixType>
void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
{
typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
using std::abs;
using std::ceil;
using std::imag;
@@ -54,14 +55,14 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
{
result(0,1) = A(0,1) / A(0,0);
}
- else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
+ else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
{
result(0,1) = A(0,1) * (logA11 - logA00) / y;
}
else
{
// computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
- int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - EIGEN_PI) / (2*EIGEN_PI)));
+ int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)));
result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
}
}
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index f37d31c3f..6167368d8 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -298,7 +298,7 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
ComplexScalar logCurr = log(curr);
ComplexScalar logPrev = log(prev);
- int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - EIGEN_PI) / (2*EIGEN_PI));
+ int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
}
diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
index b8ba6ddcb..8fe3ed86b 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -150,7 +150,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType &x)
fjac.resize(n, n);
if (!useExternalScaling)
diag.resize(n);
- eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
/* Function Body */
nfev = 0;
@@ -390,7 +390,7 @@ HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType &
fvec.resize(n);
if (!useExternalScaling)
diag.resize(n);
- eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
/* Function Body */
nfev = 0;
diff --git a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
index 69106ddc5..fe3b79ca7 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -179,7 +179,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType &x)
fjac.resize(m, n);
if (!useExternalScaling)
diag.resize(n);
- eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
qtf.resize(n);
/* Function Body */
@@ -215,7 +215,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType &x)
{
using std::abs;
using std::sqrt;
-
+
eigen_assert(x.size()==n); // check the caller is not cheating us
/* calculate the jacobian matrix. */
@@ -398,7 +398,7 @@ LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType
fjac.resize(n, n);
if (!useExternalScaling)
diag.resize(n);
- eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
qtf.resize(n);
/* Function Body */
diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h
index d1636f466..ddcddfc9a 100644
--- a/unsupported/Eigen/src/Splines/Spline.h
+++ b/unsupported/Eigen/src/Splines/Spline.h
@@ -394,7 +394,7 @@ namespace Eigen
Matrix<Scalar,Order,Order> ndu(p+1,p+1);
- double saved, temp;
+ Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that?
ndu(0,0) = 1.0;
@@ -433,7 +433,7 @@ namespace Eigen
// Compute the k-th derivative
for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
{
- double d = 0.0;
+ Scalar d = 0.0;
DenseIndex rk,pk,j1,j2;
rk = r-k; pk = p-k;
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 96652bfcf..22442b394 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -110,34 +110,48 @@ ei_add_test(minres)
ei_add_test(levenberg_marquardt)
ei_add_test(kronecker_product)
-ei_add_test(float16)
+# TODO: The following test names are prefixed with the cxx11 string, since historically
+# the tests depended on c++11. This isn't the case anymore, so we ought to rename them.
+ei_add_test(cxx11_float16)
+ei_add_test(cxx11_tensor_dimension)
+ei_add_test(cxx11_tensor_map)
+ei_add_test(cxx11_tensor_assign)
+ei_add_test(cxx11_tensor_comparisons)
+ei_add_test(cxx11_tensor_forced_eval)
+ei_add_test(cxx11_tensor_math)
+ei_add_test(cxx11_tensor_const)
+ei_add_test(cxx11_tensor_intdiv)
+ei_add_test(cxx11_tensor_casts)
+ei_add_test(cxx11_tensor_empty)
+ei_add_test(cxx11_tensor_sugar)
+ei_add_test(cxx11_tensor_roundings)
+ei_add_test(cxx11_tensor_layout_swap)
+ei_add_test(cxx11_tensor_io)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+ # This test requires __uint128_t which is only available on 64bit systems
+ ei_add_test(cxx11_tensor_uint128)
+endif()
if(EIGEN_TEST_CXX11)
# It should be safe to always run these tests as there is some fallback code for
# older compiler that don't support cxx11.
set(CMAKE_CXX_STANDARD 11)
+ ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+ ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_meta)
ei_add_test(cxx11_tensor_simple)
# ei_add_test(cxx11_tensor_symmetry)
- ei_add_test(cxx11_tensor_assign)
- ei_add_test(cxx11_tensor_dimension)
ei_add_test(cxx11_tensor_index_list)
ei_add_test(cxx11_tensor_mixed_indices)
- ei_add_test(cxx11_tensor_comparisons)
ei_add_test(cxx11_tensor_contraction)
ei_add_test(cxx11_tensor_convolution)
ei_add_test(cxx11_tensor_expr)
- ei_add_test(cxx11_tensor_math)
- ei_add_test(cxx11_tensor_forced_eval)
ei_add_test(cxx11_tensor_fixed_size)
- ei_add_test(cxx11_tensor_const)
ei_add_test(cxx11_tensor_of_const_values)
ei_add_test(cxx11_tensor_of_complex)
ei_add_test(cxx11_tensor_of_strings)
- ei_add_test(cxx11_tensor_intdiv)
ei_add_test(cxx11_tensor_lvalue)
- ei_add_test(cxx11_tensor_map)
ei_add_test(cxx11_tensor_broadcasting)
ei_add_test(cxx11_tensor_chipping)
ei_add_test(cxx11_tensor_concatenation)
@@ -155,23 +169,11 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_ref)
ei_add_test(cxx11_tensor_random)
- ei_add_test(cxx11_tensor_casts)
- ei_add_test(cxx11_tensor_roundings)
- ei_add_test(cxx11_tensor_reverse)
- ei_add_test(cxx11_tensor_layout_swap)
- ei_add_test(cxx11_tensor_io)
ei_add_test(cxx11_tensor_generator)
ei_add_test(cxx11_tensor_custom_op)
ei_add_test(cxx11_tensor_custom_index)
- ei_add_test(cxx11_tensor_sugar)
ei_add_test(cxx11_tensor_fft)
ei_add_test(cxx11_tensor_ifft)
- ei_add_test(cxx11_tensor_empty)
-
- if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
- # This test requires __uint128_t which is only available on 64bit systems
- ei_add_test(cxx11_tensor_uint128)
- endif()
endif()
@@ -191,6 +193,10 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE)
endif()
+ if(EIGEN_TEST_CUDA_CLANG)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+ endif()
+
set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"")
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
@@ -207,10 +213,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(cxx11_tensor_random_cuda)
endif()
- # Operations other that casting of half floats are only supported starting with arch 5.3
- if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 52)
- ei_add_test(cxx11_tensor_of_float16_cuda)
- endif()
+ ei_add_test(cxx11_tensor_of_float16_cuda)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index d3718e2d2..1dd6dc97d 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -54,7 +54,7 @@ complex<long double> promote(long double x) { return complex<long double>( x);
long double difpower=0;
size_t n = (min)( buf1.size(),buf2.size() );
for (size_t k=0;k<n;++k) {
- totalpower += (numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2.;
+ totalpower += (numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2;
difpower += numext::abs2(buf1[k] - buf2[k]);
}
return sqrt(difpower/totalpower);
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 724ea7b5b..6a5ed057f 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -14,6 +14,9 @@
using std::sqrt;
+// tolerance for checking the number of iterations
+#define LM_EVAL_COUNT_TOL 4/3
+
int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
{
/* subroutine fcn for chkder example. */
@@ -1023,7 +1026,8 @@ void testNistLanczos1(void)
VERIFY_IS_EQUAL(lm.njev, 72);
// check norm^2
std::cout.precision(30);
- VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4290986055242372e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats
+ std::cout << lm.fvec.squaredNorm() << "\n";
+ VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
// check x
VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -1044,7 +1048,7 @@ void testNistLanczos1(void)
VERIFY_IS_EQUAL(lm.nfev, 9);
VERIFY_IS_EQUAL(lm.njev, 8);
// check norm^2
- VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.430571737783119393e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats
+ VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
// check x
VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -1354,8 +1358,12 @@ void testNistMGH17(void)
// check return value
VERIFY_IS_EQUAL(info, 2);
- VERIFY(lm.nfev < 650); // 602
- VERIFY(lm.njev < 600); // 545
+ ++g_test_level;
+ VERIFY_IS_EQUAL(lm.nfev, 602); // 602
+ VERIFY_IS_EQUAL(lm.njev, 545); // 545
+ --g_test_level;
+ VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL);
+ VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL);
/*
* Second try
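A note on the tolerance macro used in the checks above: `4/3` by itself is integer division and evaluates to 1, but because the macro expands textually inside the comparison, `602 * LM_EVAL_COUNT_TOL` becomes `602 * 4/3`, which groups left-to-right as `(602*4)/3 == 802`. The bound really is about 4/3 of the reference count, though only because the macro sits on the right of the multiplication:

    #define LM_EVAL_COUNT_TOL 4/3
    int bound1 = 602 * LM_EVAL_COUNT_TOL;  // 602*4/3 == (602*4)/3 == 802
    int bound2 = LM_EVAL_COUNT_TOL * 602;  // 4/3*602 == (4/3)*602 == 602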
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 374f86df9..c4606cd17 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -16,7 +16,8 @@ EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y)
using namespace std;
// return x+std::sin(y);
EIGEN_ASM_COMMENT("mybegin");
- return static_cast<Scalar>(x*2 - 1 + pow(1+x,2) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(-0.5*x*x+0));
+ // pow(float, int) promotes to pow(double, double)
+ return x*2 - 1 + static_cast<Scalar>(pow(1+x,2)) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(Scalar(-0.5)*x*x+0);
//return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2;
EIGEN_ASM_COMMENT("myend");
}
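The cast is needed because `std::pow` has no `(float, int)` overload; the arguments are promoted and the call returns `double`, which would pull the whole autodiff expression out of `Scalar`. A standalone sketch of the promotion:

    #include <cmath>
    #include <type_traits>

    int main() {
      float x = 1.5f;
      auto y = std::pow(1 + x, 2);  // mixed float/int arguments promote: y is double
      static_assert(std::is_same<decltype(y), double>::value,
                    "pow(float, int) yields double");
      float z = static_cast<float>(std::pow(1 + x, 2));  // narrow back explicitly
      (void)z;
      return 0;
    }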
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
new file mode 100644
index 000000000..f16cc6f07
--- /dev/null
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+// Visual Studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe.
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+ EIGEN_UNUSED_VARIABLE(s);
+ return rand();
+#else
+ return rand_r(s);
+#endif
+}
+
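+// EventCount waiting protocol, as exercised below: a waiter registers with
+// Prewait(), re-checks its wait predicate, and then either blocks in
+// CommitWait() until a matching Notify() or backs out with CancelWait().
+// Notify(true) wakes all registered waiters, Notify(false) at most one.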
+static void test_basic_eventcount()
+{
+ std::vector<EventCount::Waiter> waiters(1);
+ EventCount ec(waiters);
+ EventCount::Waiter& w = waiters[0];
+ ec.Notify(false);
+ ec.Prewait(&w);
+ ec.Notify(true);
+ ec.CommitWait(&w);
+ ec.Prewait(&w);
+ ec.CancelWait(&w);
+}
+
+// Fake bounded counter-based queue.
+struct TestQueue {
+ std::atomic<int> val_;
+ static const int kQueueSize = 10;
+
+ TestQueue() : val_() {}
+
+ ~TestQueue() { VERIFY_IS_EQUAL(val_.load(), 0); }
+
+ bool Push() {
+ int val = val_.load(std::memory_order_relaxed);
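+ // On failure compare_exchange_weak reloads 'val' with the value it
+ // observed, so each retry re-validates the bounds with fresh state.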
+ for (;;) {
+ VERIFY_GE(val, 0);
+ VERIFY_LE(val, kQueueSize);
+ if (val == kQueueSize) return false;
+ if (val_.compare_exchange_weak(val, val + 1, std::memory_order_relaxed))
+ return true;
+ }
+ }
+
+ bool Pop() {
+ int val = val_.load(std::memory_order_relaxed);
+ for (;;) {
+ VERIFY_GE(val, 0);
+ VERIFY_LE(val, kQueueSize);
+ if (val == 0) return false;
+ if (val_.compare_exchange_weak(val, val - 1, std::memory_order_relaxed))
+ return true;
+ }
+ }
+
+ bool Empty() { return val_.load(std::memory_order_relaxed) == 0; }
+};
+
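+// Out-of-line definition for the static member: if kQueueSize is odr-used
+// (e.g. bound to a const reference), pre-C++17 it must also be defined at
+// namespace scope, not just initialized inside the class.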
+const int TestQueue::kQueueSize;
+
+// A number of producers send messages to a set of consumers using a set of
+// fake queues. Ensure that it does not crash, that consumers don't deadlock,
+// and that the numbers of blocked and unblocked threads match.
+static void test_stress_eventcount()
+{
+ const int kThreads = std::thread::hardware_concurrency();
+ static const int kEvents = 1 << 16;
+ static const int kQueues = 10;
+
+ std::vector<EventCount::Waiter> waiters(kThreads);
+ EventCount ec(waiters);
+ TestQueue queues[kQueues];
+
+ std::vector<std::unique_ptr<std::thread>> producers;
+ for (int i = 0; i < kThreads; i++) {
+ producers.emplace_back(new std::thread([&ec, &queues]() {
+ unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+ for (int j = 0; j < kEvents; j++) {
+ unsigned idx = rand_reentrant(&rnd) % kQueues;
+ if (queues[idx].Push()) {
+ ec.Notify(false);
+ continue;
+ }
+ EIGEN_THREAD_YIELD();
+ j--;
+ }
+ }));
+ }
+
+ std::vector<std::unique_ptr<std::thread>> consumers;
+ for (int i = 0; i < kThreads; i++) {
+ consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() {
+ EventCount::Waiter& w = waiters[i];
+ unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+ for (int j = 0; j < kEvents; j++) {
+ unsigned idx = rand_reentrant(&rnd) % kQueues;
+ if (queues[idx].Pop()) continue;
+ j--;
+ ec.Prewait(&w);
+ bool empty = true;
+ for (int q = 0; q < kQueues; q++) {
+ if (!queues[q].Empty()) {
+ empty = false;
+ break;
+ }
+ }
+ if (!empty) {
+ ec.CancelWait(&w);
+ continue;
+ }
+ ec.CommitWait(&w);
+ }
+ }));
+ }
+
+ for (int i = 0; i < kThreads; i++) {
+ producers[i]->join();
+ consumers[i]->join();
+ }
+}
+
+void test_cxx11_eventcount()
+{
+ CALL_SUBTEST(test_basic_eventcount());
+ CALL_SUBTEST(test_stress_eventcount());
+}
diff --git a/unsupported/test/float16.cpp b/unsupported/test/cxx11_float16.cpp
index 13f3ddaca..9141c4820 100644
--- a/unsupported/test/float16.cpp
+++ b/unsupported/test/cxx11_float16.cpp
@@ -7,7 +7,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC float16
+#define EIGEN_TEST_FUNC cxx11_float16
#include "main.h"
#include <Eigen/src/Core/arch/CUDA/Half.h>
@@ -31,11 +31,11 @@ void test_conversion()
VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
// Verify round-to-nearest-even behavior.
- float val1 = float(half(__half{0x3c00}));
- float val2 = float(half(__half{0x3c01}));
- float val3 = float(half(__half{0x3c02}));
- VERIFY_IS_EQUAL(half(0.5 * (val1 + val2)).x, 0x3c00);
- VERIFY_IS_EQUAL(half(0.5 * (val2 + val3)).x, 0x3c02);
+ float val1 = float(half(__half(0x3c00)));
+ float val2 = float(half(__half(0x3c01)));
+ float val3 = float(half(__half(0x3c02)));
+ VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
+ VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
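+ // 0x3c00 is 1.0 and 0x3c01, 0x3c02 are the next two representable halfs;
+ // both averages above are exact ties, and round-to-nearest-even resolves
+ // them to the neighbors with even mantissas, 0x3c00 and 0x3c02.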
// Conversion from int.
VERIFY_IS_EQUAL(half(-1).x, 0xbc00);
@@ -49,35 +49,43 @@ void test_conversion()
VERIFY_IS_EQUAL(half(true).x, 0x3c00);
// Conversion to float.
- VERIFY_IS_EQUAL(float(half(__half{0x0000})), 0.0f);
- VERIFY_IS_EQUAL(float(half(__half{0x3c00})), 1.0f);
+ VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f);
+ VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f);
// Denormals.
- VERIFY_IS_APPROX(float(half(__half{0x8001})), -5.96046e-08f);
- VERIFY_IS_APPROX(float(half(__half{0x0001})), 5.96046e-08f);
- VERIFY_IS_APPROX(float(half(__half{0x0002})), 1.19209e-07f);
+ VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f);
+ VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f);
+ VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f);
// NaNs and infinities.
VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number.
VERIFY(!(numext::isnan)(float(half(0.0f))));
- VERIFY((numext::isinf)(float(half(__half{0xfc00}))));
- VERIFY((numext::isnan)(float(half(__half{0xfc01}))));
- VERIFY((numext::isinf)(float(half(__half{0x7c00}))));
- VERIFY((numext::isnan)(float(half(__half{0x7c01}))));
+ VERIFY((numext::isinf)(float(half(__half(0xfc00)))));
+ VERIFY((numext::isnan)(float(half(__half(0xfc01)))));
+ VERIFY((numext::isinf)(float(half(__half(0x7c00)))));
+ VERIFY((numext::isnan)(float(half(__half(0x7c01)))));
+
+#if !EIGEN_COMP_MSVC
+ // Visual Studio errors out on divisions by 0
VERIFY((numext::isnan)(float(half(0.0 / 0.0))));
VERIFY((numext::isinf)(float(half(1.0 / 0.0))));
VERIFY((numext::isinf)(float(half(-1.0 / 0.0))));
+#endif
// Exactly same checks as above, just directly on the half representation.
- VERIFY(!(numext::isinf)(half(__half{0x7bff})));
- VERIFY(!(numext::isnan)(half(__half{0x0000})));
- VERIFY((numext::isinf)(half(__half{0xfc00})));
- VERIFY((numext::isnan)(half(__half{0xfc01})));
- VERIFY((numext::isinf)(half(__half{0x7c00})));
- VERIFY((numext::isnan)(half(__half{0x7c01})));
+ VERIFY(!(numext::isinf)(half(__half(0x7bff))));
+ VERIFY(!(numext::isnan)(half(__half(0x0000))));
+ VERIFY((numext::isinf)(half(__half(0xfc00))));
+ VERIFY((numext::isnan)(half(__half(0xfc01))));
+ VERIFY((numext::isinf)(half(__half(0x7c00))));
+ VERIFY((numext::isnan)(half(__half(0x7c01))));
+
+#if !EIGEN_COMP_MSVC
+ // Visual Studio errors out on divisions by 0
VERIFY((numext::isnan)(half(0.0 / 0.0)));
VERIFY((numext::isinf)(half(1.0 / 0.0)));
VERIFY((numext::isinf)(half(-1.0 / 0.0)));
+#endif
}
void test_arithmetic()
@@ -114,6 +122,8 @@ void test_comparison()
VERIFY(half(1.0f) != half(2.0f));
// Comparisons with NaNs and infinities.
+#if !EIGEN_COMP_MSVC
+ // Visual Studio errors out on divisions by 0
VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0)));
VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0));
@@ -124,13 +134,26 @@ void test_comparison()
VERIFY(half(1.0) < half(1.0 / 0.0));
VERIFY(half(1.0) > half(-1.0 / 0.0));
+#endif
}
-void test_functions()
+void test_basic_functions()
{
VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f);
VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f);
+ VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f);
+ VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f);
+
+ VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f);
+ VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f);
+
+ VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f);
+ VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f);
+
+ VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f);
+ VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f);
+
VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f);
VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI));
@@ -138,10 +161,32 @@ void test_functions()
VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f);
}
-void test_float16()
+void test_trigonometric_functions()
+{
+ VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f)));
+ VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI)));
+ //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2)));
+ //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2)));
+ VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f)));
+
+ VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f)));
+ // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI)));
+ VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2)));
+ VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2)));
+ VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f)));
+
+ VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f)));
+ // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI)));
+ // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2)));
+ //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2)));
+ VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f)));
+}
+
+void test_cxx11_float16()
{
CALL_SUBTEST(test_conversion());
CALL_SUBTEST(test_arithmetic());
CALL_SUBTEST(test_comparison());
- CALL_SUBTEST(test_functions());
+ CALL_SUBTEST(test_basic_functions());
+ CALL_SUBTEST(test_trigonometric_functions());
}
diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
index ecac3add1..8911c59d8 100644
--- a/unsupported/test/cxx11_meta.cpp
+++ b/unsupported/test/cxx11_meta.cpp
@@ -10,7 +10,7 @@
#include "main.h"
#include <array>
-#include <Eigen/CXX11/Core>
+#include <Eigen/CXX11/src/util/CXX11Meta.h>
using Eigen::internal::is_same;
using Eigen::internal::type_list;
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
new file mode 100644
index 000000000..d20d87111
--- /dev/null
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -0,0 +1,227 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include <cstdlib>
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+
+// Visual Studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe.
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+ EIGEN_UNUSED_VARIABLE(s);
+ return rand();
+#else
+ return rand_r(s);
+#endif
+}
+
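+// RunQueue contract exercised below: PushFront()/PushBack() return 0 on
+// success and hand the element back when the queue is full; PopFront()
+// returns 0 when empty; PopBackHalf() steals up to half of the queued
+// elements. A consequence is that 0 itself cannot be used as a queue value.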
+void test_basic_runqueue()
+{
+ RunQueue<int, 4> q;
+ // Check empty state.
+ VERIFY(q.Empty());
+ VERIFY_IS_EQUAL(0u, q.Size());
+ VERIFY_IS_EQUAL(0, q.PopFront());
+ std::vector<int> stolen;
+ VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+ VERIFY_IS_EQUAL(0u, stolen.size());
+ // Push one front, pop one front.
+ VERIFY_IS_EQUAL(0, q.PushFront(1));
+ VERIFY_IS_EQUAL(1u, q.Size());
+ VERIFY_IS_EQUAL(1, q.PopFront());
+ VERIFY_IS_EQUAL(0u, q.Size());
+ // Push front to overflow.
+ VERIFY_IS_EQUAL(0, q.PushFront(2));
+ VERIFY_IS_EQUAL(1u, q.Size());
+ VERIFY_IS_EQUAL(0, q.PushFront(3));
+ VERIFY_IS_EQUAL(2u, q.Size());
+ VERIFY_IS_EQUAL(0, q.PushFront(4));
+ VERIFY_IS_EQUAL(3u, q.Size());
+ VERIFY_IS_EQUAL(0, q.PushFront(5));
+ VERIFY_IS_EQUAL(4u, q.Size());
+ VERIFY_IS_EQUAL(6, q.PushFront(6));
+ VERIFY_IS_EQUAL(4u, q.Size());
+ VERIFY_IS_EQUAL(5, q.PopFront());
+ VERIFY_IS_EQUAL(3u, q.Size());
+ VERIFY_IS_EQUAL(4, q.PopFront());
+ VERIFY_IS_EQUAL(2u, q.Size());
+ VERIFY_IS_EQUAL(3, q.PopFront());
+ VERIFY_IS_EQUAL(1u, q.Size());
+ VERIFY_IS_EQUAL(2, q.PopFront());
+ VERIFY_IS_EQUAL(0u, q.Size());
+ VERIFY_IS_EQUAL(0, q.PopFront());
+ // Push one back, pop one back.
+ VERIFY_IS_EQUAL(0, q.PushBack(7));
+ VERIFY_IS_EQUAL(1u, q.Size());
+ VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+ VERIFY_IS_EQUAL(1u, stolen.size());
+ VERIFY_IS_EQUAL(7, stolen[0]);
+ VERIFY_IS_EQUAL(0u, q.Size());
+ stolen.clear();
+ // Push back to overflow.
+ VERIFY_IS_EQUAL(0, q.PushBack(8));
+ VERIFY_IS_EQUAL(1u, q.Size());
+ VERIFY_IS_EQUAL(0, q.PushBack(9));
+ VERIFY_IS_EQUAL(2u, q.Size());
+ VERIFY_IS_EQUAL(0, q.PushBack(10));
+ VERIFY_IS_EQUAL(3u, q.Size());
+ VERIFY_IS_EQUAL(0, q.PushBack(11));
+ VERIFY_IS_EQUAL(4u, q.Size());
+ VERIFY_IS_EQUAL(12, q.PushBack(12));
+ VERIFY_IS_EQUAL(4u, q.Size());
+ // Pop back in halves.
+ VERIFY_IS_EQUAL(2u, q.PopBackHalf(&stolen));
+ VERIFY_IS_EQUAL(2u, stolen.size());
+ VERIFY_IS_EQUAL(10, stolen[0]);
+ VERIFY_IS_EQUAL(11, stolen[1]);
+ VERIFY_IS_EQUAL(2u, q.Size());
+ stolen.clear();
+ VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+ VERIFY_IS_EQUAL(1u, stolen.size());
+ VERIFY_IS_EQUAL(9, stolen[0]);
+ VERIFY_IS_EQUAL(1u, q.Size());
+ stolen.clear();
+ VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));
+ VERIFY_IS_EQUAL(1u, stolen.size());
+ VERIFY_IS_EQUAL(8, stolen[0]);
+ stolen.clear();
+ VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+ VERIFY_IS_EQUAL(0u, stolen.size());
+ // Empty again.
+ VERIFY(q.Empty());
+ VERIFY_IS_EQUAL(0u, q.Size());
+}
+
+// Empty tests that the queue is not claimed to be empty when it is in fact
+// not. The emptiness property is a crucial part of the thread pool blocking
+// scheme, so we go to great effort to ensure it. We create a queue with
+// 1 element and then push 1 element (either front or back at random) and pop
+// 1 element (either front or back at random), so the queue always contains at
+// least 1 element but otherwise changes chaotically. Another thread constantly
+// tests that the queue is not claimed to be empty.
+void test_empty_runqueue()
+{
+ RunQueue<int, 4> q;
+ q.PushFront(1);
+ std::atomic<bool> done(false);
+ std::thread mutator([&q, &done]() {
+ unsigned rnd = 0;
+ std::vector<int> stolen;
+ for (int i = 0; i < 1 << 18; i++) {
+ if (rand_reentrant(&rnd) % 2)
+ VERIFY_IS_EQUAL(0, q.PushFront(1));
+ else
+ VERIFY_IS_EQUAL(0, q.PushBack(1));
+ if (rand_reentrant(&rnd) % 2)
+ VERIFY_IS_EQUAL(1, q.PopFront());
+ else {
+ for (;;) {
+ if (q.PopBackHalf(&stolen) == 1) {
+ stolen.clear();
+ break;
+ }
+ VERIFY_IS_EQUAL(0u, stolen.size());
+ }
+ }
+ }
+ done = true;
+ });
+ while (!done) {
+ VERIFY(!q.Empty());
+ int size = q.Size();
+ VERIFY_GE(size, 1);
+ VERIFY_LE(size, 2);
+ }
+ VERIFY_IS_EQUAL(1, q.PopFront());
+ mutator.join();
+}
+
+// Stress is a chaotic random test.
+// One thread (the owner) calls PushFront/PopFront while other threads call
+// PushBack/PopBackHalf. Ensure that we don't crash or deadlock and that all
+// sanity checks pass.
+void test_stress_runqueue()
+{
+ static const int kEvents = 1 << 18;
+ RunQueue<int, 8> q;
+ std::atomic<int> total(0);
+ std::vector<std::unique_ptr<std::thread>> threads;
+ threads.emplace_back(new std::thread([&q, &total]() {
+ int sum = 0;
+ int pushed = 1;
+ int popped = 1;
+ while (pushed < kEvents || popped < kEvents) {
+ if (pushed < kEvents) {
+ if (q.PushFront(pushed) == 0) {
+ sum += pushed;
+ pushed++;
+ }
+ }
+ if (popped < kEvents) {
+ int v = q.PopFront();
+ if (v != 0) {
+ sum -= v;
+ popped++;
+ }
+ }
+ }
+ total += sum;
+ }));
+ for (int i = 0; i < 2; i++) {
+ threads.emplace_back(new std::thread([&q, &total]() {
+ int sum = 0;
+ for (int j = 1; j < kEvents; j++) {
+ if (q.PushBack(j) == 0) {
+ sum += j;
+ continue;
+ }
+ EIGEN_THREAD_YIELD();
+ j--;
+ }
+ total += sum;
+ }));
+ threads.emplace_back(new std::thread([&q, &total]() {
+ int sum = 0;
+ std::vector<int> stolen;
+ for (int j = 1; j < kEvents;) {
+ if (q.PopBackHalf(&stolen) == 0) {
+ EIGEN_THREAD_YIELD();
+ continue;
+ }
+ while (stolen.size() && j < kEvents) {
+ int v = stolen.back();
+ stolen.pop_back();
+ VERIFY_IS_NOT_EQUAL(v, 0);
+ sum += v;
+ j++;
+ }
+ }
+ while (stolen.size()) {
+ int v = stolen.back();
+ stolen.pop_back();
+ VERIFY_IS_NOT_EQUAL(v, 0);
+ while ((v = q.PushBack(v)) != 0) EIGEN_THREAD_YIELD();
+ }
+ total -= sum;
+ }));
+ }
+ for (size_t i = 0; i < threads.size(); i++) threads[i]->join();
+ VERIFY(q.Empty());
+ VERIFY(total.load() == 0);
+}
+
+void test_cxx11_runqueue()
+{
+ CALL_SUBTEST_1(test_basic_runqueue());
+ CALL_SUBTEST_2(test_empty_runqueue());
+ CALL_SUBTEST_3(test_stress_runqueue());
+}
diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp
index 482dfa7de..037767270 100644
--- a/unsupported/test/cxx11_tensor_argmax.cpp
+++ b/unsupported/test/cxx11_tensor_argmax.cpp
@@ -64,7 +64,7 @@ static void test_argmax_tuple_reducer()
Tensor<Tuple<DenseIndex, float>, 0, DataLayout> reduced;
DimensionList<DenseIndex, 4> dims;
reduced = index_tuples.reduce(
- dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float>>());
+ dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float> >());
Tensor<float, 0, DataLayout> maxi = tensor.maximum();
@@ -74,7 +74,7 @@ static void test_argmax_tuple_reducer()
for (int d = 0; d < 3; ++d) reduce_dims[d] = d;
Tensor<Tuple<DenseIndex, float>, 1, DataLayout> reduced_by_dims(7);
reduced_by_dims = index_tuples.reduce(
- reduce_dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float>>());
+ reduce_dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float> >());
Tensor<float, 1, DataLayout> max_by_dims = tensor.maximum(reduce_dims);
@@ -96,7 +96,7 @@ static void test_argmin_tuple_reducer()
Tensor<Tuple<DenseIndex, float>, 0, DataLayout> reduced;
DimensionList<DenseIndex, 4> dims;
reduced = index_tuples.reduce(
- dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float>>());
+ dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float> >());
Tensor<float, 0, DataLayout> mini = tensor.minimum();
@@ -106,7 +106,7 @@ static void test_argmin_tuple_reducer()
for (int d = 0; d < 3; ++d) reduce_dims[d] = d;
Tensor<Tuple<DenseIndex, float>, 1, DataLayout> reduced_by_dims(7);
reduced_by_dims = index_tuples.reduce(
- reduce_dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float>>());
+ reduce_dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float> >());
Tensor<float, 1, DataLayout> min_by_dims = tensor.minimum(reduce_dims);
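The whitespace-only `> >` changes here (and in the other test files below) restore C++03 compatibility: without the space, the consecutive closing angle brackets of a nested template argument list lex as the right-shift operator. A minimal illustration:

    template <typename T> struct Wrap {};

    Wrap<Wrap<int> > a;    // parses under C++03 and later
    // Wrap<Wrap<int>> b;  // C++11 and later only: C++03 lexes ">>" as a shift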
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 6d1ef07f9..98ac180ef 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -84,6 +84,65 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
cudaFree((void*)d_t_result);
}
+
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size)
+{
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+ // Contracting over both dimensions reduces the whole 2-D product to a
+ // single scalar, exercising the full-reduction path of the CUDA
+ // contraction kernel.
+ Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+ Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+ Tensor<float, 0, DataLayout> t_result;
+ Tensor<float, 0, DataLayout> t_result_gpu;
+ Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(float);
+ std::size_t t_right_bytes = t_right.size() * sizeof(float);
+ std::size_t t_result_bytes = sizeof(float);
+
+ float* d_t_left;
+ float* d_t_right;
+ float* d_t_result;
+
+ cudaMalloc((void**)(&d_t_left), t_left_bytes);
+ cudaMalloc((void**)(&d_t_right), t_right_bytes);
+ cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+ cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+ cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_left(d_t_left, m_size, k_size);
+ Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+ gpu_t_right(d_t_right, k_size, n_size);
+ Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+ gpu_t_result(d_t_result);
+
+ gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+ t_result = t_left.contract(t_right, dims);
+
+ cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+ !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+ std::cout << "mismatch detected: " << t_result()
+ << " vs " << t_result_gpu() << std::endl;
+ assert(false);
+ }
+
+ cudaFree((void*)d_t_left);
+ cudaFree((void*)d_t_right);
+ cudaFree((void*)d_t_result);
+}
+
+
template<int DataLayout>
void test_cuda_contraction_m() {
for (int k = 32; k < 256; k++) {
@@ -138,6 +197,9 @@ void test_cxx11_tensor_cuda()
CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
+
CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 0e16308a2..73623b2ed 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -87,19 +87,14 @@ static void test_scalar()
vec1.setRandom();
vec2.setRandom();
- Tensor<float, 1, DataLayout> scalar(1);
- scalar.setZero();
Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
- typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator;
- Evaluator eval(vec1.contract(vec2, dims), DefaultDevice());
- eval.evalTo(scalar.data());
- EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ Tensor<float, 0, DataLayout> scalar = vec1.contract(vec2, dims);
float expected = 0.0f;
for (int i = 0; i < 6; ++i) {
expected += vec1(i) * vec2(i);
}
- VERIFY_IS_APPROX(scalar(0), expected);
+ VERIFY_IS_APPROX(scalar(), expected);
}
template<int DataLayout>
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 134359611..4026f48f0 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -661,6 +661,9 @@ void test_cuda_digamma()
for (int i = 5; i < 7; ++i) {
VERIFY_IS_EQUAL(out(i), expected_out(i));
}
+
+ cudaFree(d_in);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -718,13 +721,17 @@ void test_cuda_zeta()
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
VERIFY_IS_EQUAL(out(0), expected_out(0));
- VERIFY_IS_APPROX_OR_LESS_THAN(out(3), expected_out(3));
+ VERIFY((std::isnan)(out(3)));
for (int i = 1; i < 6; ++i) {
if (i != 3) {
VERIFY_IS_APPROX(out(i), expected_out(i));
}
}
+
+ cudaFree(d_in_x);
+ cudaFree(d_in_q);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -787,6 +794,10 @@ void test_cuda_polygamma()
for (int i = 0; i < 7; ++i) {
VERIFY_IS_APPROX(out(i), expected_out(i));
}
+
+ cudaFree(d_in_x);
+ cudaFree(d_in_n);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -826,9 +837,9 @@ void test_cuda_igamma()
Scalar* d_a;
Scalar* d_x;
Scalar* d_out;
- cudaMalloc((void**)(&d_a), bytes);
- cudaMalloc((void**)(&d_x), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
+ assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
+ assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
@@ -854,6 +865,10 @@ void test_cuda_igamma()
}
}
}
+
+ cudaFree(d_a);
+ cudaFree(d_x);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -920,6 +935,10 @@ void test_cuda_igammac()
}
}
}
+
+ cudaFree(d_a);
+ cudaFree(d_x);
+ cudaFree(d_out);
}
template <typename Scalar>
@@ -935,8 +954,8 @@ void test_cuda_erf(const Scalar stddev)
Scalar* d_in;
Scalar* d_out;
- cudaMalloc((void**)(&d_in), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
+ assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index cbe9e6449..b6ca54d93 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -241,7 +241,7 @@ void test_cpu() {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
(in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
- if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
+ if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
continue;
}
VERIFY_IS_APPROX(expected, result);
@@ -258,7 +258,7 @@ void test_cpu() {
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
(in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
- if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
+ if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
continue;
}
VERIFY_IS_APPROX(expected, result);
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index ce78efe52..421e73693 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -37,7 +37,6 @@ static void test_fixed_size()
VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7);
}
-
static void test_match()
{
Eigen::DSizes<int, 3> dyn(2,3,7);
@@ -49,10 +48,22 @@ static void test_match()
VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn1, dyn2), false);
}
+static void test_rank_zero()
+{
+ Eigen::Sizes<> scalar;
+ VERIFY_IS_EQUAL(scalar.TotalSize(), 1);
+ VERIFY_IS_EQUAL(scalar.rank(), 0);
+ VERIFY_IS_EQUAL(internal::array_prod(scalar), 1);
+
+ Eigen::DSizes<ptrdiff_t, 0> dscalar;
+ VERIFY_IS_EQUAL(dscalar.TotalSize(), 1);
+ VERIFY_IS_EQUAL(dscalar.rank(), 0);
+}
void test_cxx11_tensor_dimension()
{
CALL_SUBTEST(test_dynamic_size());
CALL_SUBTEST(test_fixed_size());
CALL_SUBTEST(test_match());
+ CALL_SUBTEST(test_rank_zero());
}
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
index 9130fff35..d7eea42d7 100644
--- a/unsupported/test/cxx11_tensor_empty.cpp
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -24,10 +24,10 @@ static void test_empty_tensor()
static void test_empty_fixed_size_tensor()
{
- TensorFixedSize<float, Sizes<0>> source;
- TensorFixedSize<float, Sizes<0>> tgt1 = source;
- TensorFixedSize<float, Sizes<0>> tgt2(source);
- TensorFixedSize<float, Sizes<0>> tgt3;
+ TensorFixedSize<float, Sizes<0> > source;
+ TensorFixedSize<float, Sizes<0> > tgt1 = source;
+ TensorFixedSize<float, Sizes<0> > tgt2(source);
+ TensorFixedSize<float, Sizes<0> > tgt3;
tgt3 = tgt1;
tgt3 = tgt2;
}
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 8389e9840..4dd355e6e 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -112,13 +112,13 @@ static void test_3d()
Tensor<float, 3> mat1(2,3,7);
Tensor<float, 3, RowMajor> mat2(2,3,7);
- float val = 1.0;
+ float val = 1.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
mat2(i,j,k) = val;
- val += 1.0;
+ val += 1.0f;
}
}
}
@@ -142,7 +142,7 @@ static void test_3d()
Tensor<float, 3, RowMajor> mat11(2,3,7);
mat11 = mat2 / 3.14f;
- val = 1.0;
+ val = 1.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
@@ -155,7 +155,7 @@ static void test_3d()
VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f);
VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f);
VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f);
- val += 1.0;
+ val += 1.0f;
}
}
}
@@ -167,25 +167,25 @@ static void test_constants()
Tensor<float, 3> mat2(2,3,7);
Tensor<float, 3> mat3(2,3,7);
- float val = 1.0;
+ float val = 1.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
- val += 1.0;
+ val += 1.0f;
}
}
}
mat2 = mat1.constant(3.14f);
mat3 = mat1.cwiseMax(7.3f).exp();
- val = 1.0;
+ val = 1.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat2(i,j,k), 3.14f);
VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f)));
- val += 1.0;
+ val += 1.0f;
}
}
}
@@ -228,25 +228,25 @@ static void test_functors()
Tensor<float, 3> mat2(2,3,7);
Tensor<float, 3> mat3(2,3,7);
- float val = 1.0;
+ float val = 1.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
- val += 1.0;
+ val += 1.0f;
}
}
}
mat2 = mat1.inverse().unaryExpr(&asinf);
mat3 = mat1.unaryExpr(&tanhf);
- val = 1.0;
+ val = 1.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k)));
VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k)));
- val += 1.0;
+ val += 1.0f;
}
}
}
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
index 89874349f..2f14ebc62 100644
--- a/unsupported/test/cxx11_tensor_fft.cpp
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -205,15 +205,15 @@ static void test_fft_real_input_energy() {
VERIFY_IS_EQUAL(output.dimension(i), input.dimension(i));
}
- float energy_original = 0.0;
- float energy_after_fft = 0.0;
+ RealScalar energy_original = 0.0;
+ RealScalar energy_after_fft = 0.0;
for (int i = 0; i < total_size; ++i) {
- energy_original += pow(std::abs(input(i)), 2);
+ energy_original += numext::abs2(input(i));
}
for (int i = 0; i < total_size; ++i) {
- energy_after_fft += pow(std::abs(output(i)), 2);
+ energy_after_fft += numext::abs2(output(i));
}
if(FFTDirection == FFT_FORWARD) {
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 1c33fefb3..4c660de65 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -20,6 +20,8 @@ static void test_0d()
TensorFixedSize<float, Sizes<> > scalar1;
TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
VERIFY_IS_EQUAL(scalar1.rank(), 0);
+ VERIFY_IS_EQUAL(scalar1.size(), 1);
+ VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
scalar1() = 7.0;
scalar2() = 13.0;
@@ -128,9 +130,9 @@ static void test_tensor_map()
static void test_2d()
{
float data1[6];
- TensorMap<TensorFixedSize<float, Sizes<2, 3> >> mat1(data1,2,3);
+ TensorMap<TensorFixedSize<float, Sizes<2, 3> > > mat1(data1,2,3);
float data2[6];
- TensorMap<TensorFixedSize<float, Sizes<2, 3>, RowMajor>> mat2(data2,2,3);
+ TensorMap<TensorFixedSize<float, Sizes<2, 3>, RowMajor> > mat2(data2,2,3);
VERIFY_IS_EQUAL((mat1.size()), 2*3);
VERIFY_IS_EQUAL(mat1.rank(), 2);
@@ -151,7 +153,7 @@ static void test_2d()
mat2(1,1) = -4.0;
mat2(1,2) = -5.0;
- TensorFixedSize<float, Sizes<2, 3>> mat3;
+ TensorFixedSize<float, Sizes<2, 3> > mat3;
TensorFixedSize<float, Sizes<2, 3>, RowMajor> mat4;
mat3 = mat1.abs();
mat4 = mat2.abs();
@@ -186,13 +188,13 @@ static void test_3d()
// VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
// VERIFY_IS_EQUAL((mat1.dimension(2)), 7);
- float val = 0.0;
+ float val = 0.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
mat2(i,j,k) = val;
- val += 1.0;
+ val += 1.0f;
}
}
}
@@ -208,13 +210,13 @@ static void test_3d()
// VERIFY_IS_EQUAL((mat3.dimension(2)), 7);
- val = 0.0;
+ val = 0.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val));
VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val));
- val += 1.0;
+ val += 1.0f;
}
}
}
@@ -224,12 +226,12 @@ static void test_3d()
static void test_array()
{
TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
- float val = 0.0;
+ float val = 0.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
- val += 1.0;
+ val += 1.0f;
}
}
}
@@ -237,12 +239,12 @@ static void test_array()
TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
mat3 = mat1.pow(3.5f);
- val = 0.0;
+ val = 0.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f));
- val += 1.0;
+ val += 1.0f;
}
}
}
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index ad9de867d..45d7345e9 100644
--- a/unsupported/test/cxx11_tensor_forced_eval.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -22,14 +22,15 @@ static void test_simple()
m1.setRandom();
m2.setRandom();
- TensorMap<Tensor<float, 2>> mat1(m1.data(), 3,3);
- TensorMap<Tensor<float, 2>> mat2(m2.data(), 3,3);
+ TensorMap<Tensor<float, 2> > mat1(m1.data(), 3,3);
+ TensorMap<Tensor<float, 2> > mat2(m2.data(), 3,3);
Tensor<float, 2> mat3(3,3);
mat3 = mat1;
typedef Tensor<float, 1>::DimensionPair DimPair;
- Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+ Eigen::array<DimPair, 1> dims;
+ dims[0] = DimPair(1, 0);
mat3 = mat3.contract(mat2, dims).eval();
@@ -60,7 +61,7 @@ static void test_const()
Eigen::array<int, 2> bcast;
bcast[0] = 3;
bcast[1] = 1;
- const TensorMap<Tensor<const float, 2>> input_tensor(input.data(), 3, 3);
+ const TensorMap<Tensor<const float, 2> > input_tensor(input.data(), 3, 3);
Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
for (int i = 0; i < 3; ++i) {
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
index 5d6a49181..988b01481 100644
--- a/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -568,13 +568,7 @@ static void test_imagenet_patches()
VERIFY_IS_EQUAL(l_out.dimension(4), 16);
// RowMajor
- Tensor<float, 4, RowMajor> l_in_row_major = l_in.swap_layout();
- VERIFY_IS_EQUAL(l_in.dimension(0), l_in_row_major.dimension(3));
- VERIFY_IS_EQUAL(l_in.dimension(1), l_in_row_major.dimension(2));
- VERIFY_IS_EQUAL(l_in.dimension(2), l_in_row_major.dimension(1));
- VERIFY_IS_EQUAL(l_in.dimension(3), l_in_row_major.dimension(0));
-
- Tensor<float, 5, RowMajor> l_out_row_major = l_in_row_major.extract_image_patches(11, 11);
+ Tensor<float, 5, RowMajor> l_out_row_major = l_in.swap_layout().extract_image_patches(11, 11);
VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 16);
VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 128*128);
VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
@@ -589,10 +583,8 @@ static void test_imagenet_patches()
for (int r = 0; r < 11; ++r) {
for (int d = 0; d < 3; ++d) {
float expected = 0.0f;
- float expected_row_major = 0.0f;
if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
expected = l_in(d, r-5+i, c-5+j, b);
- expected_row_major = l_in_row_major(b, c-5+j, r-5+i, d);
}
// ColMajor
if (l_out(d, r, c, patchId, b) != expected) {
@@ -601,15 +593,13 @@ static void test_imagenet_patches()
VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
// RowMajor
if (l_out_row_major(b, patchId, c, r, d) !=
- expected_row_major) {
+ expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j
<< " r=" << r << " c=" << c << " d=" << d << " b=" << b
<< std::endl;
}
VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
- expected_row_major);
- // Check that ColMajor and RowMajor agree.
- VERIFY_IS_EQUAL(expected, expected_row_major);
+ expected);
}
}
}
@@ -628,8 +618,7 @@ static void test_imagenet_patches()
VERIFY_IS_EQUAL(l_out.dimension(4), 32);
// RowMajor
- l_in_row_major = l_in.swap_layout();
- l_out_row_major = l_in_row_major.extract_image_patches(9, 9);
+ l_out_row_major = l_in.swap_layout().extract_image_patches(9, 9);
VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
@@ -644,10 +633,8 @@ static void test_imagenet_patches()
for (int r = 0; r < 9; ++r) {
for (int d = 0; d < 16; ++d) {
float expected = 0.0f;
- float expected_row_major = 0.0f;
if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
expected = l_in(d, r-4+i, c-4+j, b);
- expected_row_major = l_in_row_major(b, c-4+j, r-4+i, d);
}
// ColMajor
if (l_out(d, r, c, patchId, b) != expected) {
@@ -655,12 +642,10 @@ static void test_imagenet_patches()
}
VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
// RowMajor
- if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) {
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
- VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major);
- // Check that ColMajor and RowMajor agree.
- VERIFY_IS_EQUAL(expected, expected_row_major);
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
}
}
}
@@ -679,8 +664,7 @@ static void test_imagenet_patches()
VERIFY_IS_EQUAL(l_out.dimension(4), 32);
// RowMajor
- l_in_row_major = l_in.swap_layout();
- l_out_row_major = l_in_row_major.extract_image_patches(7, 7);
+ l_out_row_major = l_in.swap_layout().extract_image_patches(7, 7);
VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
@@ -695,10 +679,8 @@ static void test_imagenet_patches()
for (int r = 0; r < 7; ++r) {
for (int d = 0; d < 32; ++d) {
float expected = 0.0f;
- float expected_row_major = 0.0f;
if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
expected = l_in(d, r-3+i, c-3+j, b);
- expected_row_major = l_in_row_major(b, c-3+j, r-3+i, d);
}
// ColMajor
if (l_out(d, r, c, patchId, b) != expected) {
@@ -706,12 +688,10 @@ static void test_imagenet_patches()
}
VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
// RowMajor
- if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) {
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
- VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major);
- // Check that ColMajor and RowMajor agree.
- VERIFY_IS_EQUAL(expected, expected_row_major);
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
}
}
}
@@ -730,8 +710,7 @@ static void test_imagenet_patches()
VERIFY_IS_EQUAL(l_out.dimension(4), 32);
// RowMajor
- l_in_row_major = l_in.swap_layout();
- l_out_row_major = l_in_row_major.extract_image_patches(3, 3);
+ l_out_row_major = l_in.swap_layout().extract_image_patches(3, 3);
VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
@@ -746,10 +725,8 @@ static void test_imagenet_patches()
for (int r = 0; r < 3; ++r) {
for (int d = 0; d < 64; ++d) {
float expected = 0.0f;
- float expected_row_major = 0.0f;
if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
expected = l_in(d, r-1+i, c-1+j, b);
- expected_row_major = l_in_row_major(b, c-1+j, r-1+i, d);
}
// ColMajor
if (l_out(d, r, c, patchId, b) != expected) {
@@ -757,12 +734,10 @@ static void test_imagenet_patches()
}
VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
// RowMajor
- if (l_out_row_major(b, patchId, c, r, d) != expected_row_major) {
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
- VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected_row_major);
- // Check that ColMajor and RowMajor agree.
- VERIFY_IS_EQUAL(expected, expected_row_major);
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
}
}
}
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
index a8a095e38..3db0ee7c0 100644
--- a/unsupported/test/cxx11_tensor_map.cpp
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -19,8 +19,8 @@ static void test_0d()
Tensor<int, 0> scalar1;
Tensor<int, 0, RowMajor> scalar2;
- TensorMap<Tensor<const int, 0>> scalar3(scalar1.data());
- TensorMap<Tensor<const int, 0, RowMajor>> scalar4(scalar2.data());
+ TensorMap<Tensor<const int, 0> > scalar3(scalar1.data());
+ TensorMap<Tensor<const int, 0, RowMajor> > scalar4(scalar2.data());
scalar1() = 7;
scalar2() = 13;
@@ -37,8 +37,8 @@ static void test_1d()
Tensor<int, 1> vec1(6);
Tensor<int, 1, RowMajor> vec2(6);
- TensorMap<Tensor<const int, 1>> vec3(vec1.data(), 6);
- TensorMap<Tensor<const int, 1, RowMajor>> vec4(vec2.data(), 6);
+ TensorMap<Tensor<const int, 1> > vec3(vec1.data(), 6);
+ TensorMap<Tensor<const int, 1, RowMajor> > vec4(vec2.data(), 6);
vec1(0) = 4; vec2(0) = 0;
vec1(1) = 8; vec2(1) = 1;
@@ -85,8 +85,8 @@ static void test_2d()
mat2(1,1) = 4;
mat2(1,2) = 5;
- TensorMap<Tensor<const int, 2>> mat3(mat1.data(), 2, 3);
- TensorMap<Tensor<const int, 2, RowMajor>> mat4(mat2.data(), 2, 3);
+ TensorMap<Tensor<const int, 2> > mat3(mat1.data(), 2, 3);
+ TensorMap<Tensor<const int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
VERIFY_IS_EQUAL(mat3.rank(), 2);
VERIFY_IS_EQUAL(mat3.size(), 6);
@@ -129,8 +129,8 @@ static void test_3d()
}
}
- TensorMap<Tensor<const int, 3>> mat3(mat1.data(), 2, 3, 7);
- TensorMap<Tensor<const int, 3, RowMajor>> mat4(mat2.data(), array<DenseIndex, 3>{{2, 3, 7}});
+ TensorMap<Tensor<const int, 3> > mat3(mat1.data(), 2, 3, 7);
+ TensorMap<Tensor<const int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
VERIFY_IS_EQUAL(mat3.rank(), 3);
VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
@@ -173,8 +173,8 @@ static void test_from_tensor()
}
}
- TensorMap<Tensor<int, 3>> mat3(mat1);
- TensorMap<Tensor<int, 3, RowMajor>> mat4(mat2);
+ TensorMap<Tensor<int, 3> > mat3(mat1);
+ TensorMap<Tensor<int, 3, RowMajor> > mat4(mat2);
VERIFY_IS_EQUAL(mat3.rank(), 3);
VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
@@ -199,19 +199,23 @@ static void test_from_tensor()
}
}
- TensorFixedSize<int, Sizes<2,3,7>> mat5;
+ TensorFixedSize<int, Sizes<2,3,7> > mat5;
val = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
- mat5(i,j,k) = val;
+ array<ptrdiff_t, 3> coords;
+ coords[0] = i;
+ coords[1] = j;
+ coords[2] = k;
+ mat5(coords) = val;
val++;
}
}
}
- TensorMap<TensorFixedSize<int, Sizes<2,3,7>>> mat6(mat5);
+ TensorMap<TensorFixedSize<int, Sizes<2,3,7> > > mat6(mat5);
VERIFY_IS_EQUAL(mat6.rank(), 3);
VERIFY_IS_EQUAL(mat6.size(), 2*3*7);
@@ -233,8 +237,8 @@ static void test_from_tensor()
static int f(const TensorMap<Tensor<int, 3> >& tensor) {
// Size<0> empty;
- EIGEN_STATIC_ASSERT((internal::array_size<Sizes<>>::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT((internal::array_size<DSizes<int, 0>>::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((internal::array_size<Sizes<> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((internal::array_size<DSizes<int, 0> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
Tensor<int, 0> result = tensor.sum();
return result();
}
@@ -253,7 +257,7 @@ static void test_casting()
}
}
- TensorMap<Tensor<int, 3>> map(tensor);
+ TensorMap<Tensor<int, 3> > map(tensor);
int sum1 = f(map);
int sum2 = f(tensor);
diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp
index d247bebaa..61c742a16 100644
--- a/unsupported/test/cxx11_tensor_math.cpp
+++ b/unsupported/test/cxx11_tensor_math.cpp
@@ -16,7 +16,7 @@ using Eigen::RowMajor;
static void test_tanh()
{
- Tensor<float, 1> vec1({6});
+ Tensor<float, 1> vec1(6);
vec1.setRandom();
Tensor<float, 1> vec2 = vec1.tanh();
@@ -28,7 +28,7 @@ static void test_tanh()
static void test_sigmoid()
{
- Tensor<float, 1> vec1({6});
+ Tensor<float, 1> vec1(6);
vec1.setRandom();
Tensor<float, 1> vec2 = vec1.sigmoid();
diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
index 72f826216..4fba6fdd1 100644
--- a/unsupported/test/cxx11_tensor_mixed_indices.cpp
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -14,8 +14,8 @@
static void test_simple()
{
- Tensor<float, 1, ColMajor> vec1({6});
- Tensor<float, 1, ColMajor, int> vec2({6});
+ Tensor<float, 1, ColMajor> vec1(6);
+ Tensor<float, 1, ColMajor, int> vec2(6);
vec1(0) = 4.0; vec2(0) = 0.0;
vec1(1) = 8.0; vec2(1) = 1.0;
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index cb917bb37..dceac793e 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -134,6 +134,68 @@ void test_cuda_elementwise() {
gpu_device.deallocate(d_res_float);
}
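+// Evaluates exp and log once in float (casting the result to half) and once
+// directly in half precision; the two paths should agree to within
+// half-precision tolerance.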
+void test_cuda_trancendental() {
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
+ d_float1, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
+ d_float2, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(
+ d_res1_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(
+ d_res1_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(
+ d_res2_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(
+ d_res2_float, num_elem);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+ gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+ gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
+ gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
+ gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().exp();
+ gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>().log();
+
+ Tensor<float, 1> input1(num_elem);
+ Tensor<Eigen::half, 1> half_prec1(num_elem);
+ Tensor<Eigen::half, 1> full_prec1(num_elem);
+ Tensor<float, 1> input2(num_elem);
+ Tensor<Eigen::half, 1> half_prec2(num_elem);
+ Tensor<Eigen::half, 1> full_prec2(num_elem);
+ gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+ }
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+ VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+ }
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res1_half);
+ gpu_device.deallocate(d_res1_float);
+ gpu_device.deallocate(d_res2_half);
+ gpu_device.deallocate(d_res2_float);
+}
+
void test_cuda_contractions() {
Eigen::CudaStreamDevice stream;
@@ -144,36 +206,38 @@ void test_cuda_contractions() {
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
d_float1, rows, cols);
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
d_float2, rows, cols);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_half(
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
d_res_half, rows, cols);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_float(
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
d_res_float, rows, cols);
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
- gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float1.constant(0.5f);
+ gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
typedef Tensor<float, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
- gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims);
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims).cast<float>();
+ gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
- Tensor<float, 2> half_prec(rows, cols);
- Tensor<float, 2> full_prec(rows, cols);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ Tensor<Eigen::half, 2> half_prec(rows, cols);
+ Tensor<Eigen::half, 2> full_prec(rows, cols);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
gpu_device.synchronize();
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
- std::cout << "Checking contract " << i << " " << j << std::endl;
- VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+ std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+ if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+ VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+ }
}
}
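
A half contraction accumulates many rounded products, so instead of asserting approximate equality unconditionally, the loop above only escalates to VERIFY_IS_APPROX once the absolute difference exceeds an explicit threshold. The idiom in isolation (hypothetical helper; the threshold is illustrative):

  // only fail when the values are clearly apart; VERIFY_IS_APPROX then
  // reports both operands (macro from Eigen's test/main.h)
  template <typename Scalar>
  void verify_if_clearly_off(const Scalar& a, const Scalar& b, const Scalar& tol) {
    if (Eigen::numext::abs(a - b) > tol) {
      VERIFY_IS_APPROX(a, b);
    }
  }
  // e.g. verify_if_clearly_off(full_prec(i, j), half_prec(i, j), Eigen::half(1e-2f));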
@@ -192,29 +256,29 @@ void test_cuda_reductions() {
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float));
- float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(size * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(size * sizeof(Eigen::half));
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
d_float1, size, size);
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
d_float2, size, size);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
d_res_half, size);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, size);
gpu_float1.device(gpu_device) = gpu_float1.random();
gpu_float2.device(gpu_device) = gpu_float2.random();
Eigen::array<int, 1> redux_dim = {{0}};
- gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim);
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim).cast<float>();
+ gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
- Tensor<float, 1> half_prec(size);
- Tensor<float, 1> full_prec(size);
- gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(float));
- gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(float));
+ Tensor<Eigen::half, 1> half_prec(size);
+ Tensor<Eigen::half, 1> full_prec(size);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(Eigen::half));
gpu_device.synchronize();
for (int i = 0; i < size; ++i) {
@@ -222,6 +286,61 @@ void test_cuda_reductions() {
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
}
+ redux_dim = {{1}};
+ gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
+
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < size; ++i) {
+ std::cout << "Checking redux " << i << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float1);
+ gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
+
+void test_cuda_full_reductions() {
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int size = 13;
+ int num_elem = size*size;
+
+ float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+ Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+ d_float1, size, size);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+ d_float2, size, size);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+ d_res_half);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+ d_res_float);
+
+ gpu_float1.device(gpu_device) = gpu_float1.random();
+ gpu_float2.device(gpu_device) = gpu_float2.random();
+
+ gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+
+ Tensor<Eigen::half, 0> half_prec;
+ Tensor<Eigen::half, 0> full_prec;
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+ gpu_device.synchronize();
+
+ VERIFY_IS_APPROX(full_prec(), half_prec());
+
gpu_device.deallocate(d_float1);
gpu_device.deallocate(d_float2);
gpu_device.deallocate(d_res_half);
@@ -229,27 +348,58 @@ void test_cuda_reductions() {
}
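
test_cuda_full_reductions above relies on full reductions producing a rank-0 tensor whose single value is read back with operator(). The same mechanics on the host, as a sketch:

  Eigen::Tensor<float, 2> m(13, 13);
  m.setRandom();
  Eigen::Tensor<float, 0> total = m.sum();  // rank-0: one scalar, no dimensions
  float value = total();                    // read it back with operator()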
+void test_cuda_forced_evals() {
+ Eigen::CudaStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+ int num_elem = 101;
+
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+ d_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+ d_res_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+ d_res_float, num_elem);
+
+ gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+ gpu_res_float.device(gpu_device) = gpu_float.abs();
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+
+ Tensor<float, 1> half_prec(num_elem);
+ Tensor<float, 1> full_prec(num_elem);
+ gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+ gpu_device.synchronize();
+
+ for (int i = 0; i < num_elem; ++i) {
+ std::cout << "Checking unary " << i << std::endl;
+ VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+ }
+
+ gpu_device.deallocate(d_float);
+ gpu_device.deallocate(d_res_half);
+ gpu_device.deallocate(d_res_float);
+}
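
The interesting line in test_cuda_forced_evals is the .eval() call: without it the evaluator could fuse cast-abs-cast into a single float-to-float kernel and never actually touch half precision. Annotated sketch of the same expression (variable names as in the test above):

  gpu_res_half.device(gpu_device) =
      gpu_float.cast<Eigen::half>()  // round inputs to half
               .abs()                // computed in half...
               .eval()               // ...forced to materialize as half here
               .cast<float>();       // widened back to float for comparison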
+
#endif
void test_cxx11_tensor_of_float16_cuda()
{
#ifdef EIGEN_HAS_CUDA_FP16
- Eigen::CudaStreamDevice stream;
- Eigen::GpuDevice device(&stream);
- if (device.majorDeviceVersion() > 5 ||
- (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) {
- std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl;
-
- CALL_SUBTEST_1(test_cuda_conversion());
- CALL_SUBTEST_1(test_cuda_unary());
- CALL_SUBTEST_1(test_cuda_elementwise());
- CALL_SUBTEST_2(test_cuda_contractions());
- CALL_SUBTEST_3(test_cuda_reductions());
- }
- else {
- std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl;
- }
+ CALL_SUBTEST_1(test_cuda_conversion());
+ CALL_SUBTEST_1(test_cuda_unary());
+ CALL_SUBTEST_1(test_cuda_elementwise());
+ CALL_SUBTEST_1(test_cuda_trancendental());
+ CALL_SUBTEST_2(test_cuda_contractions());
+ CALL_SUBTEST_3(test_cuda_reductions());
+ CALL_SUBTEST_3(test_cuda_full_reductions());
+ CALL_SUBTEST_4(test_cuda_forced_evals());
+
#else
std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
#endif
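
The runner hunk above drops the runtime compute-capability gate. For reference, the removed guard had the following shape; callers that still need to skip fp16 subtests on pre-5.3 devices can apply the same check (sketch using the GpuDevice API visible in the removed lines):

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice device(&stream);
  // native fp16 arithmetic requires compute capability 5.3 or higher
  bool fp16_ok = device.majorDeviceVersion() > 5 ||
                 (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3);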
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
index 47d4d8636..fe860c970 100644
--- a/unsupported/test/cxx11_tensor_simple.cpp
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -195,7 +195,10 @@ static void test_3d()
VERIFY_IS_EQUAL((epsilon(0,2,1)), -1);
VERIFY_IS_EQUAL((epsilon(1,0,2)), -1);
- array<Eigen::DenseIndex, 3> dims{{2,3,4}};
+ array<Eigen::DenseIndex, 3> dims;
+ dims[0] = 2;
+ dims[1] = 3;
+ dims[2] = 4;
Tensor<int, 3> t1(dims);
Tensor<int, 3, RowMajor> t2(dims);
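
This hunk trades brace initialization for element-wise assignment, presumably to accommodate compilers with missing or unreliable C++11 list-initialization of Eigen's array. Both forms produce the same dimensions:

  array<Eigen::DenseIndex, 3> dims_braced{{2, 3, 4}};  // requires C++11 aggregate init
  array<Eigen::DenseIndex, 3> dims;                    // portable spelling
  dims[0] = 2; dims[1] = 3; dims[2] = 4;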
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index e46197464..423074a38 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -226,7 +226,7 @@ void test_multithread_contraction_agrees_with_singlethread() {
for (ptrdiff_t i = 0; i < st_result.size(); i++) {
// if both of the values are very small, then do nothing (because the test will fail
// due to numerical precision issues when values are small)
- if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) {
+ if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
}
}
@@ -234,6 +234,42 @@ void test_multithread_contraction_agrees_with_singlethread() {
template<int DataLayout>
+void test_full_contraction() {
+ int contract_size1 = internal::random<int>(1, 500);
+ int contract_size2 = internal::random<int>(1, 500);
+
+ Tensor<float, 2, DataLayout> left(contract_size1,
+ contract_size2);
+ Tensor<float, 2, DataLayout> right(contract_size1,
+ contract_size2);
+ left.setRandom();
+ right.setRandom();
+
+ // add constants to shift values away from 0 for more precision
+ left += left.constant(1.5f);
+ right += right.constant(1.5f);
+
+ typedef Tensor<float, 2>::DimensionPair DimPair;
+ Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
+
+ Eigen::ThreadPool tp(internal::random<int>(2, 11));
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
+
+ Tensor<float, 0, DataLayout> st_result;
+ st_result = left.contract(right, dims);
+
+ Tensor<float, 0, DataLayout> tp_result;
+ tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+ VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+ // skip the relative check when the absolute difference is tiny: relative
+ // comparisons are unreliable when both values are close to zero
+ if (fabs(st_result() - tp_result()) >= 1e-4f) {
+ VERIFY_IS_APPROX(st_result(), tp_result());
+ }
+}
+
+template<int DataLayout>
void test_multithreaded_reductions() {
const int num_threads = internal::random<int>(3, 11);
ThreadPool thread_pool(num_threads);
@@ -324,6 +360,9 @@ void test_cxx11_tensor_thread_pool()
CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+ CALL_SUBTEST_4(test_full_contraction<ColMajor>());
+ CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+
CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
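
Contracting two 2-D tensors over both index pairs, as test_full_contraction does, yields a rank-0 result equal to the sum of elementwise products (the Frobenius inner product). A single-threaded sketch of the identity being exercised:

  Eigen::Tensor<float, 2> a(3, 4), b(3, 4);
  a.setRandom();
  b.setRandom();
  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
  Eigen::Tensor<float, 0> full = a.contract(b, dims);
  Eigen::Tensor<float, 0> ref = (a * b).sum();  // same value up to rounding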
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index a2bdb99e4..64f168c16 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -23,6 +23,9 @@
using std::sqrt;
+// tolerance for checking the number of iterations; kept unparenthesized on
+// purpose so that `n * LM_EVAL_COUNT_TOL` expands to `(n * 4) / 3`
+#define LM_EVAL_COUNT_TOL 4/3
+
struct lmder_functor : DenseFunctor<double>
{
lmder_functor(void): DenseFunctor<double>(3,15) {}
@@ -631,7 +634,7 @@ void testNistLanczos1(void)
VERIFY_IS_EQUAL(lm.nfev(), 79);
VERIFY_IS_EQUAL(lm.njev(), 72);
// check norm^2
-// VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.430899764097e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats
+ VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
// check x
VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -652,7 +655,7 @@ void testNistLanczos1(void)
VERIFY_IS_EQUAL(lm.nfev(), 9);
VERIFY_IS_EQUAL(lm.njev(), 8);
// check norm^2
-// VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.428595533845e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats
+ VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
// check x
VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
@@ -789,7 +792,10 @@ void testNistMGH10(void)
MGH10_functor functor;
LevenbergMarquardt<MGH10_functor> lm(functor);
info = lm.minimize(x);
- VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
+ ++g_test_level;
+ VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+ --g_test_level;
+ // was: VERIFY_IS_EQUAL(info, 1);
// check norm^2
VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
@@ -799,9 +805,13 @@ void testNistMGH10(void)
VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
// check return value
- //VERIFY_IS_EQUAL(info, 1);
+
+ ++g_test_level;
VERIFY_IS_EQUAL(lm.nfev(), 284 );
VERIFY_IS_EQUAL(lm.njev(), 249 );
+ --g_test_level;
+ VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL);
+ VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL);
/*
* Second try
@@ -809,7 +819,10 @@ void testNistMGH10(void)
x<< 0.02, 4000., 250.;
// do the computation
info = lm.minimize(x);
+ ++g_test_level;
VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+ // was: VERIFY_IS_EQUAL(info, 1);
+ --g_test_level;
// check norm^2
VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
@@ -819,9 +832,12 @@ void testNistMGH10(void)
VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
// check return value
- //VERIFY_IS_EQUAL(info, 1);
+ ++g_test_level;
VERIFY_IS_EQUAL(lm.nfev(), 126);
VERIFY_IS_EQUAL(lm.njev(), 116);
+ --g_test_level;
+ VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL);
+ VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL);
}
@@ -896,8 +912,12 @@ void testNistBoxBOD(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
+ ++g_test_level;
VERIFY_IS_EQUAL(lm.nfev(), 16 );
VERIFY_IS_EQUAL(lm.njev(), 15 );
+ --g_test_level;
+ VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL);
+ VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL);
// check norm^2
VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
// check x
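
The g_test_level dance above comes from Eigen's test harness (test/main.h): while g_test_level is raised, a failing VERIFY presumably reports a warning instead of aborting, so the historical iteration counts stay visible without breaking the build, and only the looser LM_EVAL_COUNT_TOL bound is actually enforced. The idiom in isolation:

  ++g_test_level;                  // failures below degrade to warnings
  VERIFY_IS_EQUAL(lm.nfev(), 16);  // exact historical count, informational only
  --g_test_level;                  // hard failures again
  VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL);  // expands to (16 * 4) / 3 = 21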
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 9a995f941..cd24064ad 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -113,8 +113,8 @@ void testMatrixLogarithm(const MatrixType& A)
MatrixType scaledA;
RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();
- if (maxImagPartOfSpectrum >= 0.9 * EIGEN_PI)
- scaledA = A * 0.9 * EIGEN_PI / maxImagPartOfSpectrum;
+ if (maxImagPartOfSpectrum >= RealScalar(0.9 * EIGEN_PI))
+ scaledA = A * RealScalar(0.9 * EIGEN_PI) / maxImagPartOfSpectrum;
else
scaledA = A;
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index 8e104ed1e..53911370f 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -24,7 +24,7 @@ void test2dRotation(double tol)
s = std::sin(angle);
B << c, s, -s, c;
- C = Apow(std::ldexp(angle,1) / EIGEN_PI);
+ C = Apow(std::ldexp(angle,1) / T(EIGEN_PI));
std::cout << "test2dRotation: i = " << i << " error powerm = " << relerr(C,B) << '\n';
VERIFY(C.isApprox(B, tol));
}
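
Both the matrix_function and matrix_power hunks wrap double constants in the matrix's scalar type. For float matrices this rounds the constant once instead of silently promoting the whole expression to double (and avoids conversion warnings). A sketch of the pattern, with MatrixType standing for any Eigen matrix type:

  typedef typename MatrixType::RealScalar RealScalar;
  // 0.9 * EIGEN_PI is evaluated in double, then rounded once to RealScalar
  const RealScalar bound = RealScalar(0.9 * EIGEN_PI);
  if (maxImagPartOfSpectrum >= bound)
    scaledA = A * (bound / maxImagPartOfSpectrum);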