-rw-r--r--  Eigen/src/Core/Inverse.h                                        |  1
-rw-r--r--  Eigen/src/SparseCore/SparseBlock.h                              | 29
-rw-r--r--  Eigen/src/SparseCore/SparseCwiseBinaryOp.h                      | 16
-rw-r--r--  Eigen/src/SparseCore/SparseCwiseUnaryOp.h                       |  8
-rw-r--r--  Eigen/src/SparseCore/SparseMatrixBase.h                         |  5
-rw-r--r--  Eigen/src/SparseCore/SparseSelfAdjointView.h                    | 33
-rw-r--r--  doc/FixedSizeVectorizable.dox                                   | 10
-rw-r--r--  doc/PassingByValue.dox                                          |  8
-rw-r--r--  doc/UnalignedArrayAssert.dox                                    |  9
-rw-r--r--  test/permutationmatrices.cpp                                    |  9
-rw-r--r--  test/sparse_basic.cpp                                           |  8
-rw-r--r--  test/sparse_block.cpp                                           | 53
-rw-r--r--  unsupported/Eigen/CXX11/Tensor                                  |  2
-rw-r--r--  unsupported/Eigen/CXX11/ThreadPool                              | 12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h    |  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h           |  6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h     | 11
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h               |  8
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h  | 44
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h       |  2
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h      |  3
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h    |  6
-rw-r--r--  unsupported/test/cxx11_non_blocking_thread_pool.cpp             | 24
-rw-r--r--  unsupported/test/cxx11_tensor_notification.cpp                  | 17
24 files changed, 231 insertions(+), 95 deletions(-)
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index f303aebf9..b76f0439d 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -45,6 +45,7 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::StorageKind>
public:
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::PlainObject PlainObject;
+ typedef typename XprType::Scalar Scalar;
typedef typename internal::ref_selector<XprType>::type XprTypeNested;
typedef typename internal::remove_all<XprTypeNested>::type XprTypeNestedCleaned;
typedef typename internal::ref_selector<Inverse>::type Nested;
diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index c3ea7e0e2..511e92b2f 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h
@@ -531,22 +531,24 @@ class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>
enum { IsRowMajor = unary_evaluator::IsRowMajor };
const unary_evaluator& m_eval;
Index m_outerPos;
- Index m_innerIndex;
- Scalar m_value;
+ const Index m_innerIndex;
Index m_end;
+ EvalIterator m_it;
public:
EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer)
: m_eval(aEval),
- m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) - 1), // -1 so that operator++ finds the first non-zero entry
+ m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ),
m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
- m_value(0),
- m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
+ m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()),
+ m_it(m_eval.m_argImpl, m_outerPos)
{
EIGEN_UNUSED_VARIABLE(outer);
eigen_assert(outer==0);
- ++(*this);
+ while(m_it && m_it.index() < m_innerIndex) ++m_it;
+ if((!m_it) || (m_it.index()!=m_innerIndex))
+ ++(*this);
}
inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
@@ -554,21 +556,20 @@ public:
inline Index row() const { return IsRowMajor ? 0 : index(); }
inline Index col() const { return IsRowMajor ? index() : 0; }
- inline Scalar value() const { return m_value; }
+ inline Scalar value() const { return m_it.value(); }
+ inline Scalar& valueRef() { return m_it.valueRef(); }
inline OuterVectorInnerIterator& operator++()
{
// search next non-zero entry
while(++m_outerPos<m_end)
{
- EvalIterator it(m_eval.m_argImpl, m_outerPos);
+ // Restart iterator at the next inner-vector:
+ m_it.~EvalIterator();
+ ::new (&m_it) EvalIterator(m_eval.m_argImpl, m_outerPos);
// search for the key m_innerIndex in the current outer-vector
- while(it && it.index() < m_innerIndex) ++it;
- if(it && it.index()==m_innerIndex)
- {
- m_value = it.value();
- break;
- }
+ while(m_it && m_it.index() < m_innerIndex) ++m_it;
+ if(m_it && m_it.index()==m_innerIndex) break;
}
return *this;
}
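The destructor call followed by placement new is the standard idiom for re-seating a member that has no assignment operator: EvalIterator typically holds a reference to the evaluator, so a plain assignment would not compile. A minimal self-contained sketch of the idiom, with a hypothetical Cursor type standing in for EvalIterator:

\code
#include <new>      // placement new
#include <cstddef>
#include <vector>

struct Cursor {
  const std::vector<int>& data;  // reference member disables operator=
  std::size_t pos;
  Cursor(const std::vector<int>& d, std::size_t p) : data(d), pos(p) {}
};

struct Holder {
  Cursor c;
  explicit Holder(const std::vector<int>& d) : c(d, 0) {}
  void reseat(const std::vector<int>& d, std::size_t p) {
    c.~Cursor();              // end the old object's lifetime...
    ::new (&c) Cursor(d, p);  // ...and construct a new one in the same storage
  }
};
\endcode

Compared to the previous version, which constructed a fresh EvalIterator on every outer step and cached the value in m_value, keeping one persistent m_it also makes it possible to expose valueRef() for write access.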
diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index 0a9bdeac2..c1ddd1ac1 100644
--- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -623,6 +623,22 @@ protected:
template<typename Derived>
template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
+{
+ call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+ return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
+{
+ call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
+ return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
SparseMatrixBase<Derived>::operator-=(const SparseMatrixBase<OtherDerived> &other)
{
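The two overloads above accept any EigenBase expression, so compound assignment now also works when the right-hand side is neither a plain sparse nor a dense object, for instance a sparse selfadjoint view (exercised by test/sparse_basic.cpp further down). A usage sketch, assuming square sparse matrices A and S of matching size:

\code
#include <Eigen/SparseCore>

// Both operations route through call_assignment with the add/sub
// assignment functors used in the overloads above.
void accumulate(Eigen::SparseMatrix<double>& A, Eigen::SparseMatrix<double>& S) {
  A += S.selfadjointView<Eigen::Lower>();  // internal::add_assign_op path
  A -= S.selfadjointView<Eigen::Lower>();  // internal::sub_assign_op path
}
\endcode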
diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
index 28f221437..ea7973790 100644
--- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -123,8 +123,10 @@ template<typename Derived>
EIGEN_STRONG_INLINE Derived&
SparseMatrixBase<Derived>::operator*=(const Scalar& other)
{
+ typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+ internal::evaluator<Derived> thisEval(derived());
for (Index j=0; j<outerSize(); ++j)
- for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+ for (EvalIterator i(thisEval,j); i; ++i)
i.valueRef() *= other;
return derived();
}
@@ -133,8 +135,10 @@ template<typename Derived>
EIGEN_STRONG_INLINE Derived&
SparseMatrixBase<Derived>::operator/=(const Scalar& other)
{
+ typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+ internal::evaluator<Derived> thisEval(derived());
for (Index j=0; j<outerSize(); ++j)
- for (typename Derived::InnerIterator i(derived(),j); i; ++i)
+ for (EvalIterator i(thisEval,j); i; ++i)
i.valueRef() /= other;
return derived();
}
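Constructing the iterator from an evaluator rather than from Derived::InnerIterator lets operator*= and operator/= traverse any writable sparse expression, not only plain storage, so in-place scaling of rows and columns of a SparseMatrix now works (see test/sparse_block.cpp below). A usage sketch:

\code
#include <Eigen/SparseCore>

void scale(Eigen::SparseMatrix<double>& A, Eigen::Index r, Eigen::Index c) {
  A.row(r) *= 2.0;  // scales the stored nonzeros of row r in place
  A.col(c) /= 3.0;  // likewise for column c
}
\endcode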
diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h
index 813accce1..6087da5c4 100644
--- a/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -265,6 +265,11 @@ template<typename Derived> class SparseMatrixBase
template<typename OtherDerived>
Derived& operator-=(const DiagonalBase<OtherDerived>& other);
+ template<typename OtherDerived>
+ Derived& operator+=(const EigenBase<OtherDerived> &other);
+ template<typename OtherDerived>
+ Derived& operator-=(const EigenBase<OtherDerived> &other);
+
Derived& operator*=(const Scalar& other);
Derived& operator/=(const Scalar& other);
diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h
index d31d9babf..9e39be738 100644
--- a/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -222,14 +222,43 @@ template< typename DstXprType, typename SrcXprType, typename Functor>
struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse>
{
typedef typename DstXprType::StorageIndex StorageIndex;
+ typedef internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> AssignOpType;
+
template<typename DestScalar,int StorageOrder>
- static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+ static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignOpType&/*func*/)
{
internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
}
+
+ // FIXME: the handling of += and -= in sparse matrices should be cleaned up so that the next two overloads could be reduced to:
+ template<typename DestScalar,int StorageOrder,typename AssignFunc>
+ static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const AssignFunc& func)
+ {
+ SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+ run(tmp, src, AssignOpType());
+ call_assignment_no_alias_no_transpose(dst, tmp, func);
+ }
+
+ template<typename DestScalar,int StorageOrder>
+ static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+ const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+ {
+ SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+ run(tmp, src, AssignOpType());
+ dst += tmp;
+ }
+
+ template<typename DestScalar,int StorageOrder>
+ static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src,
+ const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>& /* func */)
+ {
+ SparseMatrix<DestScalar,StorageOrder,StorageIndex> tmp(src.rows(),src.cols());
+ run(tmp, src, AssignOpType());
+ dst -= tmp;
+ }
template<typename DestScalar>
- static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+ static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const AssignOpType&/*func*/)
{
// TODO directly evaluate into dst;
SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(dst.rows(),dst.cols());
diff --git a/doc/FixedSizeVectorizable.dox b/doc/FixedSizeVectorizable.dox
index 49e38af76..0012465ca 100644
--- a/doc/FixedSizeVectorizable.dox
+++ b/doc/FixedSizeVectorizable.dox
@@ -1,6 +1,6 @@
namespace Eigen {
-/** \eigenManualPage TopicFixedSizeVectorizable Fixed-size vectorizable Eigen objects
+/** \eigenManualPage TopicFixedSizeVectorizable Fixed-size vectorizable %Eigen objects
The goal of this page is to explain what we mean by "fixed-size vectorizable".
@@ -23,15 +23,15 @@ Examples include:
\section FixedSizeVectorizable_explanation Explanation
-First, "fixed-size" should be clear: an Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example Matrix3f has fixed size, but MatrixXf doesn't (the opposite of fixed-size is dynamic-size).
+First, "fixed-size" should be clear: an %Eigen object has fixed size if its number of rows and its number of columns are fixed at compile-time. So for example \ref Matrix3f has fixed size, but \ref MatrixXf doesn't (the opposite of fixed-size is dynamic-size).
-The array of coefficients of a fixed-size Eigen object is a plain "static array", it is not dynamically allocated. For example, the data behind a Matrix4f is just a "float array[16]".
+The array of coefficients of a fixed-size %Eigen object is a plain "static array", it is not dynamically allocated. For example, the data behind a \ref Matrix4f is just a "float array[16]".
Fixed-size objects are typically very small, which means that we want to handle them with zero runtime overhead -- both in terms of memory usage and of speed.
-Now, vectorization (both SSE and AltiVec) works with 128-bit packets. Moreover, for performance reasons, these packets need to be have 128-bit alignment.
+Now, vectorization works with 128-bit packets (e.g., SSE, AltiVec, NEON), 256-bit packets (e.g., AVX), or 512-bit packets (e.g., AVX512). Moreover, for performance reasons, these packets are read and written most efficiently when they have the same alignment as the packet size, that is, 16, 32, and 64 bytes respectively.
-So it turns out that the only way that fixed-size Eigen objects can be vectorized, is if their size is a multiple of 128 bits, or 16 bytes. Eigen will then request 16-byte alignment for these objects, and henceforth rely on these objects being aligned so no runtime check for alignment is performed.
+So it turns out that the best way for fixed-size %Eigen objects to be vectorized is for their size to be a multiple of 16 bytes (or more). %Eigen will then request 16-byte alignment (or more) for these objects, and henceforth rely on these objects being aligned to achieve maximal efficiency.
*/
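A quick way to see the distinction on a concrete build is to print sizes and alignments; the values in the comments assume an SSE build with 16-byte packets:

\code
#include <Eigen/Core>
#include <iostream>

int main() {
  // 16-byte type, 16-byte alignment: fixed-size vectorizable.
  std::cout << sizeof(Eigen::Vector4f) << ' ' << alignof(Eigen::Vector4f) << '\n'; // 16 16
  // 12-byte type, plain 4-byte float alignment: not vectorizable.
  std::cout << sizeof(Eigen::Vector3f) << ' ' << alignof(Eigen::Vector3f) << '\n'; // 12 4
}
\endcode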
diff --git a/doc/PassingByValue.dox b/doc/PassingByValue.dox
index bf4d0ef4b..9254fe6d8 100644
--- a/doc/PassingByValue.dox
+++ b/doc/PassingByValue.dox
@@ -4,21 +4,21 @@ namespace Eigen {
Passing objects by value is almost always a very bad idea in C++, as this means useless copies, and one should pass them by reference instead.
-With Eigen, this is even more important: passing \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" by value is not only inefficient, it can be illegal or make your program crash! And the reason is that these Eigen objects have alignment modifiers that aren't respected when they are passed by value.
+With %Eigen, this is even more important: passing \ref TopicFixedSizeVectorizable "fixed-size vectorizable Eigen objects" by value is not only inefficient, it can be illegal or make your program crash! And the reason is that these %Eigen objects have alignment modifiers that aren't respected when they are passed by value.
-So for example, a function like this, where v is passed by value:
+For example, a function like this, where \c v is passed by value:
\code
void my_function(Eigen::Vector2d v);
\endcode
-needs to be rewritten as follows, passing v by reference:
+needs to be rewritten as follows, passing \c v by const reference:
\code
void my_function(const Eigen::Vector2d& v);
\endcode
-Likewise if you have a class having a Eigen object as member:
+Likewise, if you have a class with an %Eigen object as a member:
\code
struct Foo
diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox
index 95d95a2d5..0f7022973 100644
--- a/doc/UnalignedArrayAssert.dox
+++ b/doc/UnalignedArrayAssert.dox
@@ -115,6 +115,15 @@ If you want to know why defining EIGEN_DONT_VECTORIZE does not by itself disable
It doesn't disable the assertion, because otherwise code that runs fine without vectorization would suddenly crash when enabling vectorization.
It doesn't disable 16-byte alignment, because that would mean that vectorized and non-vectorized code are not mutually ABI-compatible. This ABI compatibility is very important, even for people who develop only an in-house application, as for instance one may want to have in the same application a vectorized path and a non-vectorized path.
+\section checkmycode How can I check whether my code is safe regarding alignment issues?
+
+Unfortunately, there is no way in C++ to detect any of the aforementioned shortcomings at compile time (though static analysers are becoming more and more powerful and could detect some of them).
+Even at runtime, all we can do is catch an invalid unaligned allocation and trigger the explicit assertion mentioned at the beginning of this page.
+Therefore, if your program runs fine on a given system with some given compilation flags, this does not guarantee that your code is safe. For instance, on most 64-bit systems, buffers are aligned on 16-byte boundaries, so if you do not enable the AVX instruction set, your code will run fine. The same code may, however, assert when moving to a more exotic platform, or when enabling AVX instructions, which require 32-byte alignment by default.
+
+The situation is not hopeless though. Assuming your code is well covered by unit tests, you can check its alignment safety by linking it to a custom malloc library that returns only 8-byte-aligned buffers. That way, all alignment shortcomings should pop up. To this end, you must also compile your program with \link TopicPreprocessorDirectivesPerformance EIGEN_MALLOC_ALREADY_ALIGNED=0 \endlink.
+
+
*/
}
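A minimal sketch of such a custom malloc library for Linux, assuming LD_PRELOAD interposition and a g++ build (the file name and build line are illustrative; a real harness would also interpose calloc, realloc, and posix_memalign):

\code
// Build: g++ -shared -fPIC -o misalign.so misalign.cpp -ldl
// Every allocation is returned at an address that is 8 (mod 16), so code
// silently relying on 16-byte-aligned heap buffers trips Eigen's assertion.
#include <dlfcn.h>
#include <cstdint>
#include <cstring>

extern "C" void* malloc(std::size_t n) {
  static void* (*real_malloc)(std::size_t) =
      reinterpret_cast<void* (*)(std::size_t)>(dlsym(RTLD_NEXT, "malloc"));
  unsigned char* p = static_cast<unsigned char*>(real_malloc(n + 32));
  if (!p) return 0;
  // First address past p that is 8 (mod 16), with >= 8 bytes of headroom:
  unsigned char* q = p + 16 - (reinterpret_cast<std::uintptr_t>(p) & 15) + 8;
  std::memcpy(q - sizeof(void*), &p, sizeof(void*));  // stash the real pointer
  return q;
}

extern "C" void free(void* q) {
  static void (*real_free)(void*) =
      reinterpret_cast<void (*)(void*)>(dlsym(RTLD_NEXT, "free"));
  if (!q) return;
  void* p;
  std::memcpy(&p, static_cast<unsigned char*>(q) - sizeof(void*), sizeof(void*));
  real_free(p);
}
\endcode

Running the unit tests with LD_PRELOAD=./misalign.so then makes every heap allocation misaligned for SIMD purposes, so code that previously passed by luck asserts instead.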
diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp
index 41aa57d6d..70b469ebc 100644
--- a/test/permutationmatrices.cpp
+++ b/test/permutationmatrices.cpp
@@ -108,7 +108,14 @@ template<typename MatrixType> void permutationmatrices(const MatrixType& m)
rm = rp;
rm.col(i).swap(rm.col(j));
VERIFY_IS_APPROX(rm, rp2.toDenseMatrix().template cast<Scalar>());
- }
+ }
+
+ {
+ // simple compilation check
+ Matrix<Scalar, Cols, Cols> A = rp;
+ Matrix<Scalar, Cols, Cols> B = rp.transpose();
+ VERIFY_IS_APPROX(A, B.transpose());
+ }
}
template<typename T>
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 2a3117b2b..4d864bbd0 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -431,6 +431,14 @@ template<typename SparseMatrixType> void sparse_basic(const SparseMatrixType& ref)
m3 = m2.template selfadjointView<Lower>();
VERIFY_IS_APPROX(m3, refMat3);
+ refMat3 += refMat2.template selfadjointView<Lower>();
+ m3 += m2.template selfadjointView<Lower>();
+ VERIFY_IS_APPROX(m3, refMat3);
+
+ refMat3 -= refMat2.template selfadjointView<Lower>();
+ m3 -= m2.template selfadjointView<Lower>();
+ VERIFY_IS_APPROX(m3, refMat3);
+
// selfadjointView only works for square matrices:
SparseMatrixType m4(rows, rows+1);
VERIFY_RAISES_ASSERT(m4.template selfadjointView<Lower>());
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
index cb5213ade..2a0b3b617 100644
--- a/test/sparse_block.cpp
+++ b/test/sparse_block.cpp
@@ -9,6 +9,20 @@
#include "sparse.h"
+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==RowMajorBit, typename T::RowXpr>::type
+innervec(T& A, Index i)
+{
+ return A.row(i);
+}
+
+template<typename T>
+typename Eigen::internal::enable_if<(T::Flags&RowMajorBit)==0, typename T::ColXpr>::type
+innervec(T& A, Index i)
+{
+ return A.col(i);
+}
+
template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
{
const Index rows = ref.rows();
@@ -20,9 +34,10 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& ref)
typedef typename SparseMatrixType::StorageIndex StorageIndex;
double density = (std::max)(8./(rows*cols), 0.01);
- typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+ typedef Matrix<Scalar,Dynamic,Dynamic,SparseMatrixType::IsRowMajor?RowMajor:ColMajor> DenseMatrix;
typedef Matrix<Scalar,Dynamic,1> DenseVector;
typedef Matrix<Scalar,1,Dynamic> RowDenseVector;
+ typedef SparseVector<Scalar> SparseVectorType;
Scalar s1 = internal::random<Scalar>();
{
@@ -110,15 +125,35 @@ template<typename SparseMatrixType> void sparse_block(const SparseMatrixType& re
initSparse<Scalar>(density, refMat2, m2);
Index j0 = internal::random<Index>(0,outer-1);
Index j1 = internal::random<Index>(0,outer-1);
- if(SparseMatrixType::IsRowMajor)
- VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0));
- else
- VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0));
+ Index r0 = internal::random<Index>(0,rows-1);
+ Index c0 = internal::random<Index>(0,cols-1);
- if(SparseMatrixType::IsRowMajor)
- VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1));
- else
- VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1));
+ VERIFY_IS_APPROX(m2.innerVector(j0), innervec(refMat2,j0));
+ VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), innervec(refMat2,j0)+innervec(refMat2,j1));
+
+ m2.innerVector(j0) *= Scalar(2);
+ innervec(refMat2,j0) *= Scalar(2);
+ VERIFY_IS_APPROX(m2, refMat2);
+
+ m2.row(r0) *= Scalar(3);
+ refMat2.row(r0) *= Scalar(3);
+ VERIFY_IS_APPROX(m2, refMat2);
+
+ m2.col(c0) *= Scalar(4);
+ refMat2.col(c0) *= Scalar(4);
+ VERIFY_IS_APPROX(m2, refMat2);
+
+ m2.row(r0) /= Scalar(3);
+ refMat2.row(r0) /= Scalar(3);
+ VERIFY_IS_APPROX(m2, refMat2);
+
+ m2.col(c0) /= Scalar(4);
+ refMat2.col(c0) /= Scalar(4);
+ VERIFY_IS_APPROX(m2, refMat2);
+
+ SparseVectorType v1;
+ VERIFY_IS_APPROX(v1 = m2.col(c0) * 4, refMat2.col(c0)*4);
+ VERIFY_IS_APPROX(v1 = m2.row(r0) * 4, refMat2.row(r0).transpose()*4);
SparseMatrixType m3(rows,cols);
m3.reserve(VectorXi::Constant(outer,int(inner/2)));
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 8b36093f0..f98eb03bd 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -53,8 +53,10 @@ typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
+#include <windows.h>
#else
#include <stdint.h>
+#include <unistd.h>
#endif
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index 141372f63..c34614194 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -58,6 +58,18 @@
#include "src/ThreadPool/SimpleThreadPool.h"
#include "src/ThreadPool/NonBlockingThreadPool.h"
+
+// Use the more efficient NonBlockingThreadPool by default.
+namespace Eigen {
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
+} // namespace Eigen
+
#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
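With the aliases defined here in the module header (they previously lived in the Tensor-side TensorDeviceThreadPool.h, removed below), code that only includes Eigen/CXX11/ThreadPool gets a working Eigen::ThreadPool. A minimal usage sketch:

\code
#define EIGEN_USE_THREADS
#include <Eigen/CXX11/ThreadPool>
#include <atomic>

int main() {
  std::atomic<int> done(0);   // declared before the pool so it outlives it
  Eigen::ThreadPool pool(4);  // NonBlockingThreadPool unless EIGEN_USE_SIMPLE_THREAD_POOL
  for (int i = 0; i < 100; ++i)
    pool.Schedule([&done] { ++done; });
  // ~ThreadPool() lets the workers drain the queued closures, then joins them.
}
\endcode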
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 6a28024b6..ab320a50d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -418,7 +418,7 @@ class TensorContractionSubMapper {
return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const {
if (UseDirectOffsets) {
m_base_mapper.storePacket(i, 0, p);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index ec732f17d..e6cee11ef 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -88,11 +88,7 @@ static void initializeDeviceProp() {
#if __cplusplus >= 201103L
std::atomic_thread_fence(std::memory_order_acquire);
#endif
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
- Sleep(1000);
-#else
- sleep(1);
-#endif
+ EIGEN_SLEEP(1000);
}
}
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 210ae1368..16180ca69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,17 +12,6 @@
namespace Eigen {
-// Use the SimpleThreadPool by default. We'll switch to the new non blocking
-// thread pool later.
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-
-
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
class Barrier {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index ee0078bbc..f92e39d69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -51,4 +51,12 @@
#endif
+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#define EIGEN_SLEEP(n) Sleep(n)
+#elif EIGEN_OS_GNULINUX
+#define EIGEN_SLEEP(n) usleep((n) * 1000)
+#else
+#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
+#endif
+
#endif
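Note that the argument is in milliseconds on every branch: Win32 Sleep() already takes milliseconds, usleep() takes microseconds (hence the factor of 1000), and the generic fallback rounds up to whole seconds for POSIX sleep(). A small usage sketch, assuming EIGEN_SLEEP is in scope via the Tensor module headers:

\code
#include <Eigen/CXX11/Tensor>
#include <atomic>

std::atomic<bool> g_ready(false);

void wait_until_ready() {
  while (!g_ready.load()) {
    EIGEN_SLEEP(100);  // ~100ms per poll (at least 1s on the generic fallback)
  }
}
\endcode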
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index b57863163..0e6a0bf8f 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -28,6 +28,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
blocked_(0),
spinning_(0),
done_(false),
+ cancelled_(false),
ec_(waiters_) {
waiters_.resize(num_threads);
@@ -61,10 +62,19 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
~NonBlockingThreadPoolTempl() {
done_ = true;
+
// Now if all threads block without work, they will start exiting.
// But note that threads can continue to work arbitrary long,
// block, submit new work, unblock and otherwise live full life.
- ec_.Notify(true);
+ if (!cancelled_) {
+ ec_.Notify(true);
+ } else {
+ // Since we were cancelled, there might be entries in the queues.
+ // Empty them to prevent their destructor from asserting.
+ for (size_t i = 0; i < queues_.size(); i++) {
+ queues_[i]->Flush();
+ }
+ }
// Join threads explicitly to avoid destruction order issues.
for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
@@ -91,16 +101,25 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
// completes overall computations, which in turn leads to destruction of
// this. We expect that such scenario is prevented by program, that is,
// this is kept alive while any threads can potentially be in Schedule.
- if (!t.f)
+ if (!t.f) {
ec_.Notify(false);
- else
+ }
+ else {
env_.ExecuteTask(t); // Push failed, execute directly.
+ }
}
void Cancel() {
+ cancelled_ = true;
+ done_ = true;
+
+ // Let each thread know it's been cancelled.
for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->Cancel();
+ threads_[i]->OnCancel();
}
+
+ // Wake up the threads without work to let them exit on their own.
+ ec_.Notify(true);
}
int NumThreads() const final {
@@ -135,6 +154,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
+ std::atomic<bool> cancelled_;
EventCount ec_;
// Main worker thread loop.
@@ -145,7 +165,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
pt->thread_id = thread_id;
Queue* q = queues_[thread_id];
EventCount::Waiter* waiter = &waiters_[thread_id];
- for (;;) {
+ while (!cancelled_) {
Task t = q->PopFront();
if (!t.f) {
t = Steal();
@@ -158,7 +178,11 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
// pool. Consider a time based limit instead.
if (!spinning_ && !spinning_.exchange(true)) {
for (int i = 0; i < 1000 && !t.f; i++) {
- t = Steal();
+ if (!cancelled_.load(std::memory_order_relaxed)) {
+ t = Steal();
+ } else {
+ return;
+ }
}
spinning_ = false;
}
@@ -207,8 +231,12 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
int victim = NonEmptyQueueIndex();
if (victim != -1) {
ec_.CancelWait(waiter);
- *t = queues_[victim]->PopBack();
- return true;
+ if (cancelled_) {
+ return false;
+ } else {
+ *t = queues_[victim]->PopBack();
+ return true;
+ }
}
// Number of blocked threads is used as termination condition.
// If we are shutting down and all worker threads blocked without work,
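The pool now distinguishes two shutdown modes: done_ asks workers to finish the queued work and then exit, while cancelled_ asks them to exit as soon as possible, leaving the destructor to Flush() whatever remains. A minimal sketch of this two-flag protocol, with hypothetical names in place of the pool's members:

\code
#include <atomic>
#include <deque>
#include <functional>
#include <mutex>

struct WorkerState {
  std::atomic<bool> draining{false};   // mirrors done_: finish queued work
  std::atomic<bool> cancelled{false};  // mirrors cancelled_: exit immediately
  std::mutex mu;
  std::deque<std::function<void()> > queue;

  void loop() {
    while (!cancelled.load(std::memory_order_relaxed)) {
      std::function<void()> task;
      {
        std::lock_guard<std::mutex> lk(mu);
        if (queue.empty()) {
          if (draining) return;  // graceful: queue drained, exit
          continue;              // (a real pool would block here, not spin)
        }
        task = std::move(queue.front());
        queue.pop_front();
      }
      task();
    }
    // cancelled: leftover tasks are discarded by the owner (cf. Flush()).
  }
};
\endcode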
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
index ab4f85fbf..fb08deb20 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -71,7 +71,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
void Cancel() {
for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->Cancel();
+ threads_[i]->OnCancel();
}
}
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index b3c45057d..d94a06416 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -23,7 +23,8 @@ struct StlThreadEnvironment {
public:
EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
~EnvThread() { thr_.join(); }
- void Cancel() { EIGEN_THREAD_CANCEL(thr_); }
+ // This function is called when the threadpool is cancelled.
+ void OnCancel() { }
private:
std::thread thr_;
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 5935b7cd8..84e1e6cc0 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -19,8 +19,10 @@ class ThreadPoolInterface {
// Submits a closure to be run by a thread in the pool.
virtual void Schedule(std::function<void()> fn) = 0;
- // Cancel all the threads in the pool.
- virtual void Cancel() = 0;
+ // If implemented, stop processing the closures that have been enqueued.
+ // Currently running closures may still be processed.
+ // If not implemented, does nothing.
+ virtual void Cancel() {}
// Returns the number of threads in the pool.
virtual int NumThreads() const = 0;
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index fe30551ce..2c5765ce4 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -10,8 +10,8 @@
#define EIGEN_USE_THREADS
#include "main.h"
-#include <unistd.h>
#include "Eigen/CXX11/ThreadPool"
+#include "Eigen/CXX11/Tensor"
static void test_create_destroy_empty_pool()
{
@@ -104,23 +104,15 @@ static void test_parallelism()
static void test_cancel()
{
- NonBlockingThreadPool tp(4);
+ NonBlockingThreadPool tp(2);
-#ifdef EIGEN_SUPPORTS_THREAD_CANCELLATION
- std::cout << "Thread cancellation is supported on this platform" << std::endl;
-
- // Put 2 threads to sleep for much longer than the default test timeout.
- tp.Schedule([]() { sleep(3600); } );
- tp.Schedule([]() { sleep(3600 * 24); } );
-#else
- std::cout << "Thread cancellation is a no-op on this platform" << std::endl;
-
- // Make 2 threads sleep for a short period of time
- tp.Schedule([]() { sleep(1); } );
- tp.Schedule([]() { sleep(2); } );
-#endif
+ // Schedule a large number of closures that each sleep for two seconds. This
+ // will keep the thread pool busy for much longer than the default test timeout.
+ for (int i = 0; i < 1000; ++i) {
+ tp.Schedule([]() { EIGEN_SLEEP(2000); });
+ }
- // Call cancel:
+ // Cancel the processing of all the closures that are still pending.
tp.Cancel();
}
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index c946007b8..183ef02c1 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -13,15 +13,6 @@
#include "main.h"
#include <Eigen/CXX11/Tensor>
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#include <windows.h>
-void sleep(int seconds) {
- Sleep(seconds*1000);
-}
-#else
-#include <unistd.h>
-#endif
-
namespace {
@@ -40,7 +31,7 @@ static void test_notification_single()
Eigen::Notification n;
std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
thread_pool.Schedule(func);
- sleep(1);
+ EIGEN_SLEEP(1000);
// The thread should be waiting for the notification.
VERIFY_IS_EQUAL(counter, 0);
@@ -48,7 +39,7 @@ static void test_notification_single()
// Unblock the thread
n.Notify();
- sleep(1);
+ EIGEN_SLEEP(1000);
// Verify the counter has been incremented
VERIFY_IS_EQUAL(counter, 1);
@@ -67,10 +58,10 @@ static void test_notification_multiple()
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
- sleep(1);
+ EIGEN_SLEEP(1000);
VERIFY_IS_EQUAL(counter, 0);
n.Notify();
- sleep(1);
+ EIGEN_SLEEP(1000);
VERIFY_IS_EQUAL(counter, 4);
}